Skip to content

Commit

Permalink
added image processing tool using openai gpt-4o-mini multimodal model
Browse files Browse the repository at this point in the history
  • Loading branch information
vandyand committed Aug 22, 2024
1 parent f5f2a02 commit c12c02c
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 0 deletions.
44 changes: 44 additions & 0 deletions prompts/agent.tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,47 @@ When writing own code, ALWAYS put print/log statements inside and at the end of
}
}
~~~

### image_processing_tool:

Process images to get insights from them.
Provide the "image_paths" argument as a list of strings containing the paths to the image files. Image paths can be local or remote (remote image paths start with 'http').
Provide also a "query" argument to describe what to analyze or compare.
The tool will return text data summarizing the image(s) or extracted data from the image(s) depending on the query.
Use this tool for tasks like OCR (Optical Character Recognition), image analysis, style analysis of web page screenshots, or image comparison.

**Example usage**:

1. Analyze a single image

~~~json
{
"thoughts": [
"The user provided an image...",
"I will use the image_processing_tool to analyze the image..."
],
"tool_name": "image_processing_tool",
"tool_args": {
"query": "Please analyze the following image and provide feedback.",
"image_paths": ["path/to/image.jpg"]
}
}
~~~

2. Compare images

~~~json
{
"thoughts": [
"The user provided an image...",
"I will use the image_processing_tool to compare images..."
],
"tool_name": "image_processing_tool",
"tool_args": {
"query": "Please analyze the following images. Which one is ...?",
"image_paths": [
"path/to/image.jpg",
"path/to/different/picture.png",
"https://example.com/image.jpg"
]
}
}
~~~
33 changes: 33 additions & 0 deletions python/helpers/process_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from openai import OpenAI
import models

def process_image(query: str, image_urls: list, model_name="gpt-4o-mini", api_key=None):
    """Send a text query together with images to an OpenAI chat model.

    Args:
        query: Text instruction placed first in the user message.
        image_urls: URLs to attach; each one becomes an ``image_url``
            content part following the text part.
        model_name: Name of the model passed to the chat completions call.
        api_key: OpenAI API key. When falsy, the key is looked up with
            ``models.get_api_key("openai")``.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        ValueError: If no API key is supplied and none can be looked up.
    """
    key = api_key if api_key else models.get_api_key("openai")
    if not key:
        raise ValueError("The image processing tool requires an openai api key.")

    openai_client = OpenAI(api_key=key)

    # A single user message: the text query first, then one part per image.
    parts = [{"type": "text", "text": query}]
    for image_url in image_urls:
        parts.append({"type": "image_url", "image_url": {"url": image_url}})

    completion = openai_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": parts}],
    )

    # Only the text of the first choice is handed back to the caller.
    return completion.choices[0].message.content
39 changes: 39 additions & 0 deletions python/tools/image_processing_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from agent import Agent
from python.helpers.tool import Tool, Response
from python.helpers import files
from python.helpers.process_image import process_image
from python.helpers.print_style import PrintStyle
import base64
import os

class ImageProcessingTool(Tool):
    """Tool that forwards images and a text query to ``process_image``.

    Local image files are inlined as base64 ``data:`` URLs; entries that
    start with ``http`` are passed through unchanged.
    """

    @staticmethod
    def encode_image(image_path):
        """Return the bytes of the file at ``image_path`` as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def execute(self, query: str, **kwargs):
        """Run the image query and wrap the model's text in a ``Response``.

        Args:
            query: Text describing what to analyze or compare.
            **kwargs: Accepted but not used by this method; the image list
                is read from ``self.args["image_paths"]``.

        Returns:
            ``Response`` whose message is the text returned by
            ``process_image``, with ``break_loop=False``.

        Raises:
            ValueError: If ``image_paths`` is empty, not a list, or contains
                a non-string element.
            FileNotFoundError: If a local image path does not exist.
        """
        # Function-scope import keeps this block self-contained.
        import mimetypes

        image_paths = self.args["image_paths"]

        if (
            not image_paths
            or not isinstance(image_paths, list)
            or not all(isinstance(path, str) for path in image_paths)
        ):
            raise ValueError("The image_paths is either empty, None, or not a valid list of strings.")

        processed_image_paths = []

        for image_path in image_paths:
            if not image_path.startswith("http"):
                if not os.path.exists(image_path):
                    raise FileNotFoundError(f"The local file '{image_path}' not found.")
                # Label the data URL with the file's real image type (PNG,
                # GIF, WebP, ...) instead of always claiming JPEG; fall back
                # to JPEG when the extension is unknown or not an image type.
                mime_type, _ = mimetypes.guess_type(image_path)
                if not mime_type or not mime_type.startswith("image/"):
                    mime_type = "image/jpeg"
                base64_image = self.encode_image(image_path)
                image_path = f"data:{mime_type};base64,{base64_image}"
            processed_image_paths.append(image_path)

        content = process_image(query, processed_image_paths)

        # if self.agent.handle_intervention(content):
        #     return Response(message="", break_loop=False)  # wait for intervention and handle it, if paused

        # Return the response
        return Response(message=content, break_loop=False)

0 comments on commit c12c02c

Please sign in to comment.