added image processing tool using openai gpt-4o-mini multimodal model

frdel · Aug 22, 2024 · c12c02c · c12c02c
1 parent f5f2a02
commit c12c02c
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 0 deletions.
diff --git a/prompts/agent.tools.md b/prompts/agent.tools.md
@@ -214,3 +214,47 @@ When writing own code, ALWAYS put print/log statements inside and at the end of
     }
 }
 ~~~
+
+### image_processing_tool:
+
+Process images to get insights from them.
+Provide the "image_paths" argument (a list of strings) with the paths to the image files. Image paths can be local or remote (remote image paths start with 'http').
+Also provide a "query" argument describing what to analyze or compare.
+The tool returns text that summarizes the image(s) or contains data extracted from the image(s), depending on the query.
+Use this tool for tasks like OCR (Optical Character Recognition), image analysis, style analysis of web page screenshots, or image comparison.
+
+**Example usage**:
+
+1. Analyze a single image
+
+~~~json
+{
+  "thoughts": [
+    "The user provided an image...",
+    "I will use the image_processing_tool to analyze the image..."
+  ],
+  "tool_name": "image_processing_tool",
+  "tool_args": {
+    "query": "Please analyze the following image and provide feedback.",
+    "image_paths": ["path/to/image.jpg"]
+  }
+}
+~~~
+
+2. Compare images
+
+~~~json
+{
+  "thoughts": [
+    "The user provided several images...",
+    "I will use the image_processing_tool to compare images..."
+  ],
+  "tool_name": "image_processing_tool",
+  "tool_args": {
+    "query": "Please analyze the following images. Which one is ...?",
+    "image_paths": [
+      "path/to/image.jpg",
+      "path/to/different/picture.png",
+      "https://example.com/image.jpg"
+    ]
+  }
+}
+~~~
diff --git a/python/helpers/process_image.py b/python/helpers/process_image.py
@@ -0,0 +1,33 @@
+from openai import OpenAI
+import models
+
+def process_image(query: str, image_urls: list, model_name="gpt-4o-mini", api_key=None):
+
+    api_key = api_key or models.get_api_key("openai")
+
+    if not api_key:
+        raise ValueError("The image processing tool requires an openai api key.")
+
+    client = OpenAI(api_key=api_key)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": query},
+                *[
+                    {"type": "image_url", "image_url": {"url": url}}
+                    for url in image_urls
+                ]
+            ]
+        }
+    ]
+
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+    )
+
+    result = response.choices[0].message.content  # only the text is returned
+
+    return result
diff --git a/python/tools/image_processing_tool.py b/python/tools/image_processing_tool.py
@@ -0,0 +1,39 @@
+from agent import Agent
+from python.helpers.tool import Tool, Response
+from python.helpers import files
+from python.helpers.process_image import process_image
+from python.helpers.print_style import PrintStyle
+import base64
+import os
+
+class ImageProcessingTool(Tool):
+
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def execute(self, query: str, **kwargs):
+
+        image_paths = self.args["image_paths"]
+
+        if not image_paths or not isinstance(image_paths, list):
+            raise ValueError("The image_paths is either empty, None, or not a valid list of strings.")
+
+        processed_image_paths = []
+
+        for image_path in image_paths:
+            if not image_path.startswith("http"):
+                if not os.path.exists(image_path):
+                    raise FileNotFoundError(f"The local file '{image_path}' not found.")
+                with open(image_path, "rb") as image_file:
+                    base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+                    image_path = f"data:image/jpeg;base64,{base64_image}"
+            processed_image_paths.append(image_path)
+
+        content = process_image(query, processed_image_paths)
+
+        # if self.agent.handle_intervention(content): 
+        #     return Response(message="", break_loop=False)  # wait for intervention and handle it, if paused
+
+        # Return the response
+        return Response(message=content, break_loop=False)