alfulanny committed on
Commit
0106d0b
·
verified ·
1 Parent(s): f86436f

Create vision_tools.py

Browse files
Files changed (1) hide show
  1. vision_tools.py +81 -0
vision_tools.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vision tools: image captioning using Hugging Face Inference API with a local fallback.
3
+
4
+ Functions:
5
+ - `caption_image(path)`: returns a short caption for an image file.
6
+ """
7
+ from typing import Optional
8
+ import os
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ try:
14
+ from huggingface_hub import InferenceApi
15
+ except Exception:
16
+ InferenceApi = None
17
+
18
+ try:
19
+ from transformers import pipeline
20
+ except Exception:
21
+ pipeline = None
22
+
23
+ try:
24
+ from smolagents import tool
25
+ except Exception:
26
+ tool = None
27
+
28
+
29
# Lazily-created local pipelines keyed by model id, so repeated calls do not
# reload model weights from disk on every invocation.
_pipeline_cache: dict = {}


def caption_image(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
    """Return a short caption for the image at ``path``.

    Tries the Hugging Face Inference API first (when ``HF_TOKEN`` is set and
    ``huggingface_hub`` imported successfully), then falls back to a local
    ``transformers`` pipeline, and finally to a sentinel string.

    Args:
        path: Filesystem path of the image to caption.
        model: Hugging Face model id for the image-to-text task.

    Returns:
        A short textual caption, or a parenthesised error string
        ("(file not found)" / "(image captioning unavailable)").
    """
    if not os.path.exists(path):
        return "(file not found)"

    hf_token = os.environ.get("HF_TOKEN")
    # Try the hosted Inference API first.
    if hf_token and InferenceApi is not None:
        try:
            client = InferenceApi(repo_id=model, token=hf_token)
            # Image tasks take raw bytes via `data=`. The original code passed
            # the open file object as `inputs=`, which InferenceApi tries to
            # JSON-serialize — that raised on every call and silently forced
            # the local fallback path.
            with open(path, "rb") as f:
                payload = f.read()
            out = client(data=payload)
            # The API may return a dict, a list of dicts, or plain text.
            if isinstance(out, dict) and "generated_text" in out:
                return str(out["generated_text"]).strip()
            if isinstance(out, list) and out:
                first = out[0]
                if isinstance(first, dict) and "generated_text" in first:
                    return str(first["generated_text"]).strip()
                return str(first)
            return str(out)
        except Exception as e:
            logger.warning("HF Inference image captioning failed: %s", e)

    # Local pipeline fallback (may not be installed or suitable for large models)
    if pipeline is not None:
        try:
            pipe = _pipeline_cache.get(model)
            if pipe is None:
                # Building a pipeline loads the full model; do it once per model.
                pipe = pipeline("image-to-text", model=model)
                _pipeline_cache[model] = pipe
            res = pipe(path)
            if isinstance(res, list) and res:
                return res[0].get("generated_text", str(res[0]))
            return str(res)
        except Exception as e:
            logger.warning("Local pipeline image captioning failed: %s", e)

    return "(image captioning unavailable)"
68
+
69
+
70
# Export a smolagents-wrapped tool if possible.
# NOTE: smolagents' @tool decorator requires the wrapped function to carry a
# docstring that documents each argument; without one the decorator raises at
# definition time, so the original code always fell through to the plain
# function and never actually exported a Tool.
if tool is not None:
    try:
        @tool
        def caption_image_tool(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
            """Generate a short caption describing an image file.

            Args:
                path: Filesystem path of the image to caption.
                model: Hugging Face model id for the image-to-text task.
            """
            return caption_image(path, model=model)
    except Exception:
        # If decoration still fails (e.g. incompatible smolagents version),
        # expose the plain function under the same name so imports keep working.
        caption_image_tool = caption_image
else:
    caption_image_tool = caption_image

__all__ = ["caption_image", "caption_image_tool"]