Spaces:
Running
Running
| """ | |
| Vision tools: image captioning using Hugging Face Inference API with a local fallback. | |
| Functions: | |
| - `caption_image(path)`: returns a short caption for an image file. | |
| """ | |
| from typing import Optional | |
| import os | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| try: | |
| from huggingface_hub import InferenceApi | |
| except Exception: | |
| InferenceApi = None | |
| try: | |
| from transformers import pipeline | |
| except Exception: | |
| pipeline = None | |
| try: | |
| from smolagents import tool | |
| except Exception: | |
| tool = None | |
def _extract_caption(result) -> str:
    """Pull a caption string out of an image-to-text result of unknown shape.

    Accepts a dict, a list of candidates, or anything else: the first list
    element is used when a non-empty list is given, a ``"generated_text"``
    entry is returned stripped, and any other value is stringified as-is.
    """
    if isinstance(result, list) and result:
        # Both backends may hand back a list of candidates; use the first one.
        result = result[0]
    if isinstance(result, dict) and "generated_text" in result:
        return str(result["generated_text"]).strip()
    return str(result)


def caption_image(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
    """Caption the image file at ``path``.

    The Hugging Face Inference API is tried first when ``HF_TOKEN`` is set in
    the environment and ``huggingface_hub`` is importable; a local
    ``transformers`` image-to-text pipeline is the fallback.

    Args:
        path: Filesystem path of the image to caption.
        model: Hugging Face model id used by both the remote and local paths.

    Returns:
        A short caption, ``"(file not found)"`` when ``path`` is not an
        existing regular file, or ``"(image captioning unavailable)"`` when
        neither backend produced a result. Backend failures are logged as
        warnings and never raised.
    """
    # isfile rather than exists: a directory can never be captioned, so reject
    # it here instead of loading a model only to fail on it.
    if not os.path.isfile(path):
        return "(file not found)"

    hf_token = os.environ.get("HF_TOKEN")

    # Try the Inference API first.
    if hf_token and InferenceApi is not None:
        try:
            client = InferenceApi(repo_id=model, token=hf_token)
            with open(path, "rb") as f:
                image_bytes = f.read()
            # Binary content must go through `data=`; `inputs=` is placed in
            # the JSON payload, which cannot carry a file object.
            out = client(data=image_bytes)
            if isinstance(out, dict) and "error" in out and "generated_text" not in out:
                # An error payload is not a caption; fall through to the
                # local pipeline instead of returning it verbatim.
                raise RuntimeError(out["error"])
            return _extract_caption(out)
        except Exception as e:
            logger.warning("HF Inference image captioning failed: %s", e)

    # Local pipeline fallback (may not be installed or suitable for large models).
    if pipeline is not None:
        try:
            pipe = pipeline("image-to-text", model=model)
            return _extract_caption(pipe(path))
        except Exception as e:
            logger.warning("Local pipeline image captioning failed: %s", e)

    return "(image captioning unavailable)"
# Export a smolagents-wrapped tool if possible. When smolagents is missing, or
# the wrapping itself fails, `caption_image_tool` is the plain function so that
# importing the name always works.
if tool is not None:
    try:
        @tool
        def caption_image_tool(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
            """Return a short caption describing the image stored at a local path.

            Args:
                path: Filesystem path of the image file to caption.
                model: Hugging Face model id used to generate the caption.
            """
            return caption_image(path, model=model)
    except Exception as e:
        # The decorator validates the signature and docstring and may raise;
        # keep the module importable by exposing the undecorated function.
        logger.warning("Could not wrap caption_image as a smolagents tool: %s", e)
        caption_image_tool = caption_image
else:
    caption_image_tool = caption_image

__all__ = ["caption_image", "caption_image_tool"]