Add GPT-4V as evaluator #276

Open: wants to merge 15 commits into base: main
4 changes: 2 additions & 2 deletions configs/config_all.yaml
@@ -59,8 +59,8 @@ process:
radius: 2 # radius of blur kernel
- image_captioning_from_gpt4v_mapper: # generate samples whose texts are generated based on gpt-4-vision and the image
mode: 'description' # mode of text generated from images, can be one of ['reasoning', 'description', 'conversation', 'custom']
api_key: '' # the API key to authenticate the request
max_token: 500 # the maximum number of tokens to generate. Default is 500.
api_key: null # the API key to authenticate the request
max_tokens: 500 # the maximum number of tokens to generate. Default is 500.
temperature: 1.0 # controls the randomness of the output (range from 0 to 1). Default is 1.0.
system_prompt: '' # a string prompt used to set the context of the conversation and provide global guidance or rules for gpt-4-vision so that it generates responses in the expected way. If `mode` is set to `custom`, this parameter will be used
user_prompt: '' # a string prompt to guide the generation of gpt-4-vision for each sample. It's '' by default, which means no prompt is provided
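
For reference, a minimal sketch (not part of this diff) of how the options above map onto the operator's constructor; the import path is inferred from the file layout shown in this PR and the defaults mirror the config keys:

# Hypothetical usage sketch; parameter names follow the updated config keys.
from data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper import \
    ImageCaptioningFromGPT4VMapper

op = ImageCaptioningFromGPT4VMapper(
    mode='description',   # one of the modes listed above
    api_key=None,         # None falls back to the OPENAI_API_KEY environment variable
    max_tokens=500,
    temperature=1.0,
    system_prompt='',
    user_prompt='',
)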
89 changes: 12 additions & 77 deletions data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.py
@@ -1,6 +1,5 @@
import copy

import requests
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

@@ -9,6 +8,7 @@
load_image_byte,
remove_non_special_tokens,
remove_special_tokens)
from data_juicer.utils.model_utils import call_gpt_vision_api

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_IMAGES
@@ -23,75 +23,6 @@
}


def call_gpt_vision_api(api_key,
system_prompt,
user_prompt,
base64_image,
max_tokens=500,
temperature=1.0,
model='gpt-4-vision-preview'):
api_url = 'https://api.openai.com/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}
data = {
'model':
model,
'messages': [{
'role': 'system',
'content': system_prompt
}, {
'role':
'user',
'content': [{
'type': 'text',
'text': user_prompt
}, {
'type': 'image_url',
'image_url': {
'url': f'data:image/jpeg;base64,{base64_image}',
'detail': 'low'
}
}]
}],
'max_tokens':
max_tokens,
'temperature':
temperature
}
try:
response = requests.post(api_url, headers=headers, json=data)
response.raise_for_status()
result = response.json()

if 'choices' in result and result['choices']:
return result['choices'][0]['text']
else:
logger.warning('No results returned from the API, return None.')
return None

except requests.exceptions.HTTPError as errh:
if errh.response.status_code == 401:
logger.warning('Invalid API key provided.')
elif errh.response.status_code == 429:
logger.warning(
'API request limit has been reached. Please try again later.')
else:
logger.warning(f'HTTP error occurred: {errh}')
except requests.exceptions.ConnectionError:
logger.warning('Network error occurred. Please check your connection.')
except requests.exceptions.Timeout:
logger.warning('The request timed out. Please try again later.')
except requests.exceptions.RequestException as err:
logger.warningt(f'An error occurred: {err}')
except Exception as e:
logger.warning(f'An unexpected error occurred: {e}')

logger.warning('API request failed, return None.')
return None


@OPERATORS.register_module('image_captioning_from_gpt4v_mapper')
@LOADED_IMAGES.register_module('image_captioning_from_gpt4v_mapper')
class ImageCaptioningFromGPT4VMapper(Mapper):
@@ -100,8 +31,8 @@ class ImageCaptioningFromGPT4VMapper(Mapper):

def __init__(self,
mode: str = 'description',
api_key: str = '',
max_token: int = 500,
api_key: str = None,
max_tokens: int = 500,
temperature: ClosedUnitInterval = 1.0,
system_prompt: str = '',
user_prompt: str = '',
@@ -116,7 +47,7 @@ def __init__(self,
:param mode: mode of text generated from images, can be one of
['reasoning', 'description', 'conversation', 'custom']
:param api_key: the API key to authenticate the request.
:param max_token: the maximum number of tokens to generate.
:param max_tokens: the maximum number of tokens to generate.
Default is 500.
:param temperature: controls the randomness of the output (range
from 0 to 1). Default is 1.0.
@@ -162,7 +93,7 @@ def __init__(self,

self.mode = mode
self.api_key = api_key
self.max_token = max_token
self.max_tokens = max_tokens
self.temperature = temperature
self.user_prompt = user_prompt
self.user_prompt_key = user_prompt_key
@@ -219,10 +150,14 @@ def _process_single_sample(self, sample):
generated_text_single_chunk = []
for image_key in loaded_image_keys[offset:offset + img_count]:
image = images[image_key]
res = call_gpt_vision_api(self.api_key, self.system_prompt,
res = call_gpt_vision_api(self.system_prompt,
prompt_texts,
image_byte_to_base64(image),
self.max_token, self.temperature)
image_byte_to_base64(
image, 'image/jpeg'),
api_key=self.api_key,
max_tokens=self.max_tokens,
temperature=self.temperature,
**self.extra_args)
generated_text_single_chunk.append(res)
if self.any_or_all == 'all' and not all(
generated_text_single_chunk):
48 changes: 44 additions & 4 deletions data_juicer/utils/mm_utils.py
@@ -1,5 +1,8 @@
import base64
import datetime
import imghdr
import io
import mimetypes
import os
import re
import shutil
@@ -109,13 +112,27 @@ def load_image_byte(path):
return image_data


def image_path_to_base64(image_path):
def image_path_to_base64(image_path, mime_type=None):
with open(image_path, 'rb') as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
image_byte = image_file.read()
return image_byte_to_base64(image_byte, mime_type)


def image_byte_to_base64(image_byte):
return base64.b64encode(image_byte).decode('utf-8')
def image_byte_to_base64(image_byte, mime_type=None):
image_base64 = base64.b64encode(image_byte).decode('utf-8')

if mime_type is None:
return image_base64

if mime_type == 'auto':
# guess correct mime_type
image_type = imghdr.what(None, h=image_byte)
mime_type = mimetypes.types_map.get(f'.{image_type}')
if mime_type is None:
mime_type = 'image/jpeg'

# https://www.rfc-editor.org/rfc/rfc2397
return f'data:{mime_type};base64,{image_base64}'
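
To illustrate the new mime_type handling, a small usage sketch (not part of the diff); 'example.jpg' is a hypothetical file and the wrapped output follows the RFC 2397 data-URL format referenced above:

from data_juicer.utils.mm_utils import image_path_to_base64

# Hypothetical file name; without mime_type the result is a bare base64 string.
plain_b64 = image_path_to_base64('example.jpg')
# With mime_type='auto' the type is guessed from the image header and the
# result is wrapped as a data URL, e.g. 'data:image/jpeg;base64,/9j/4AAQ...'
data_url = image_path_to_base64('example.jpg', mime_type='auto')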


def pil_to_opencv(pil_image):
@@ -175,6 +192,29 @@ def load_video(path):
return container


def video_path_to_base64(video_path, frame_num=None, fps=None, mime_type=None):
duration = get_video_duration(video_path)

if fps is not None:
valid_frame_num = round(duration * fps)
if frame_num is not None:
valid_frame_num = min(valid_frame_num, frame_num)
else:
valid_frame_num = frame_num or 0

frames_base64 = []
# use the resolved frame count; frame_num itself may be None when fps is given
if valid_frame_num > 0:
frames = extract_video_frames_uniformly(video_path, valid_frame_num)
for frame in frames:
img = frame.to_image()
buf = io.BytesIO()
img.save(buf, format='JPEG')
buf.seek(0)
frame_base64 = image_byte_to_base64(buf.read(), mime_type)
frames_base64.append(frame_base64)
return frames_base64
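
A short sketch of how the new video helper is meant to be used ('example.mp4' is a hypothetical local file); it samples frames uniformly across the video and returns one base64 string per frame, wrapped as data URLs when a mime_type is given:

from data_juicer.utils.mm_utils import video_path_to_base64

# Hypothetical file name; sample 4 frames uniformly across the video.
frames = video_path_to_base64('example.mp4', frame_num=4,
                              mime_type='image/jpeg')
# len(frames) == 4; each entry looks like 'data:image/jpeg;base64,...'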


def get_video_duration(input_video: Union[str, av.container.InputContainer],
video_stream_index=0):
"""
112 changes: 104 additions & 8 deletions data_juicer/utils/model_utils.py
@@ -1,9 +1,10 @@
import fnmatch
import os
from functools import partial
from typing import Optional, Union
from typing import Any, List, Optional, Union

import multiprocess as mp
import requests
import wget
from loguru import logger

@@ -14,8 +15,8 @@
MODEL_ZOO = {}

# Default cached models links for downloading
MODEL_LINKS = 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/' \
'data_juicer/models/'
MODEL_LINKS = ('https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
'data_juicer/models/')

# Backup cached models links for downloading
BACKUP_MODEL_LINKS = {
@@ -174,9 +175,9 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'):
'pt': 'portuguese',
'es': 'spanish'
}
assert lang in nltk_to_punkt.keys(
), 'lang must be one of the following: {}'.format(
list(nltk_to_punkt.keys()))
assert (lang in nltk_to_punkt.keys()
), 'lang must be one of the following: {}'.format(
list(nltk_to_punkt.keys()))
model_name = name_pattern.format(nltk_to_punkt[lang])

logger.info('Loading nltk punkt split model...')
@@ -319,6 +320,7 @@ def __init__(self, config: Blip2Config) -> None:
self.post_init()

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
if return_model:
@@ -418,10 +420,11 @@ def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.5.0'):
# decompress the compressed model if it's not decompressed
def decompress_model(compressed_model_path):
decompressed_model_path = compressed_model_path.replace('.zip', '')
if os.path.exists(decompressed_model_path) \
and os.path.isdir(decompressed_model_path):
if os.path.exists(decompressed_model_path) and os.path.isdir(
decompressed_model_path):
return decompressed_model_path
import zipfile

with zipfile.ZipFile(compressed_model_path) as zf:
zf.extractall(DJMC)
return decompressed_model_path
@@ -506,6 +509,7 @@ def prepare_recognizeAnything_model(
:param input_size: the input size of the model.
"""
from ram.models import ram_plus

logger.info('Loading recognizeAnything model...')
try:
model = ram_plus(pretrained=check_model(pretrained_model_name_or_path),
@@ -573,3 +577,95 @@ def get_model(model_key=None, rank=None):
rank = 0 if rank is None else rank
move_to_cuda(MODEL_ZOO[model_key], rank)
return MODEL_ZOO[model_key]


def call_gpt_vision_api(
system_prompt: str = '',
user_prompt: str = '',
images: Union[str, List[str], None] = None,
*,
api_key: Optional[str] = None,
model: str = 'gpt-4-vision-preview',
max_tokens: int = 500,
temperature: float = 0.0,
**kwargs: Any,
):
images = [images] if isinstance(images, str) else (images or [])

api_url = 'https://api.openai.com/v1/chat/completions'

if api_key is None:
api_key = os.getenv('OPENAI_API_KEY')
if api_key is None:
logger.error(
'The api_key must be set either by passing it to the function '
'call or by setting the OPENAI_API_KEY environment variable')
return ''

headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}',
}
image_payload = [{
'type': 'image_url',
'image_url': {
'url': url,
'detail': 'low'
}
} for url in images]
data = {
'model':
model,
'messages': [
{
'role': 'system',
'content': system_prompt
},
{
'role':
'user',
'content': [
{
'type': 'text',
'text': user_prompt
},
*image_payload,
],
},
],
'max_tokens':
max_tokens,
'temperature':
temperature,
**kwargs,
}

try:
response = requests.post(api_url, headers=headers, json=data)
response.raise_for_status()
result = response.json()

if 'choices' in result and result['choices']:
return result['choices'][0]['message']['content']
else:
logger.warning('No results returned from the API.')
return ''
except requests.exceptions.HTTPError as errh:
if errh.response.status_code == 401:
logger.warning('Invalid API key provided.')
elif errh.response.status_code == 429:
logger.warning(
'API request limit has been reached. Please try again later.')
else:
logger.warning(f'HTTP error occurred: {errh}')
except requests.exceptions.ConnectionError:
logger.warning('Network error occurred. Please check your connection.')
except requests.exceptions.Timeout:
logger.warning('The request timed out. Please try again later.')
except requests.exceptions.RequestException as err:
logger.warning(f'An error occurred: {err}')
except Exception as e:
logger.warning(f'An unexpected error occurred: {e}')

logger.warning('API request failed.')
return ''
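
A minimal end-to-end sketch of the relocated helper ('cat.png' is a hypothetical local image and OPENAI_API_KEY is assumed to be set in the environment); on any failure the helper logs a warning and returns an empty string:

from data_juicer.utils.mm_utils import image_path_to_base64
from data_juicer.utils.model_utils import call_gpt_vision_api

# Hypothetical image file; mime_type='auto' yields a data URL the API accepts.
image_url = image_path_to_base64('cat.png', mime_type='auto')
caption = call_gpt_vision_api(
    system_prompt='You are a helpful assistant that describes images.',
    user_prompt='Describe this image in one sentence.',
    images=image_url,          # a single URL or a list of URLs
    max_tokens=100,
)
print(caption)                 # '' if the request failed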
Empty file added tools/mm_eval/__init__.py