From 6947814c3fe1508b2ffccc5b9ad36d3c2bb8ce4c Mon Sep 17 00:00:00 2001 From: jagadeeswaran-zipstack Date: Thu, 9 Jan 2025 09:48:50 +0530 Subject: [PATCH 1/5] unifyied llmw adapters --- .../x2text/llm_whisperer/src/constants.py | 18 +- .../x2text/llm_whisperer/src/helper.py | 81 ++++++ .../x2text/llm_whisperer/src/llm_whisperer.py | 203 ++++++++++---- .../llm_whisperer/src/static/json_schema.json | 252 +++++++++++------- 4 files changed, 397 insertions(+), 157 deletions(-) create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py index 6b11d65b..87b11a49 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py @@ -18,6 +18,7 @@ class OutputModes(Enum): LINE_PRINTER = "line-printer" DUMP_TEXT = "dump-text" TEXT = "text" + LAYOUT_PRESERVING = "layout_preserving" class HTTPMethod(Enum): @@ -48,10 +49,13 @@ class WhispererEnv: LLMWhisperer's status API. Defaults to 30s MAX_POLLS: Total number of times to poll the status API. Set to -1 to poll indefinitely. Defaults to -1 + STATUS_RETRIES: Number of times to retry calling LLLMWhisperer's status API + on failure during polling. Defaults to 5. 
""" POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" + STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES" class WhispererConfig: @@ -66,6 +70,7 @@ class WhispererConfig: GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" FORCE_TEXT_PROCESSING = "force_text_processing" LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" + LINE_SPLITTER_STRATEGY = "line_splitter_strategy" HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" PAGES_TO_EXTRACT = "pages_to_extract" STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting" @@ -74,7 +79,12 @@ class WhispererConfig: PAGE_SEPARATOR = "page_seperator" MARK_VERTICAL_LINES = "mark_vertical_lines" MARK_HORIZONTAL_LINES = "mark_horizontal_lines" - + URL_IN_POST = "url_in_post" + TAG = "tag" + USE_WEBHOOK = "use_webhook" + WEBHOOK_METADATA = "webhook_metadata" + TEXT_ONLY = "text_only" + VERSION = "version" class WhisperStatus: """Values returned / used by /whisper-status endpoint.""" @@ -86,6 +96,7 @@ class WhisperStatus: # Used for async processing WHISPER_HASH = "whisper-hash" STATUS = "status" + WHISPER_HASH_V2 = "whisper_hash" class WhispererDefaults: @@ -95,6 +106,7 @@ class WhispererDefaults: GAUSSIAN_BLUR_RADIUS = 0.0 FORCE_TEXT_PROCESSING = False LINE_SPLITTER_TOLERANCE = 0.75 + LINE_SPLITTER_STRATEGY = "left-priority" HORIZONTAL_STRETCH_FACTOR = 1.0 POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) @@ -104,3 +116,7 @@ class WhispererDefaults: PAGE_SEPARATOR = "<<< >>>" MARK_VERTICAL_LINES = False MARK_HORIZONTAL_LINES = False + STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5)) + URL_IN_POST = False + TAG = "default" + TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py new file mode 100644 index 00000000..11648674 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py @@ -0,0 
+1,81 @@ +import logging +from typing import Any +from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import ( + Modes, + OutputModes, + WhispererConfig, + WhispererDefaults, +) +logger = logging.getLogger(__name__) + + +class LLMWhispererHelper: + + @staticmethod + def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: + """Gets query params meant for /whisper endpoint. + + The params is filled based on the configuration passed. + + Returns: + dict[str, Any]: Query params + """ + params = { + WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value), + WhispererConfig.OUTPUT_MODE: config.get( + WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value + ), + WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get( + WhispererConfig.LINE_SPLITTER_TOLERANCE, + WhispererDefaults.LINE_SPLITTER_TOLERANCE, + ), + WhispererConfig.LINE_SPLITTER_STRATEGY: config.get( + WhispererConfig.LINE_SPLITTER_STRATEGY, + WhispererDefaults.LINE_SPLITTER_STRATEGY, + ), + WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get( + WhispererConfig.HORIZONTAL_STRETCH_FACTOR, + WhispererDefaults.HORIZONTAL_STRETCH_FACTOR, + ), + WhispererConfig.PAGES_TO_EXTRACT: config.get( + WhispererConfig.PAGES_TO_EXTRACT, + WhispererDefaults.PAGES_TO_EXTRACT, + ), + WhispererConfig.MARK_VERTICAL_LINES: config.get( + WhispererConfig.MARK_VERTICAL_LINES, + WhispererDefaults.MARK_VERTICAL_LINES, + ), + WhispererConfig.MARK_HORIZONTAL_LINES: config.get( + WhispererConfig.MARK_HORIZONTAL_LINES, + WhispererDefaults.MARK_HORIZONTAL_LINES, + ), + WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST, + WhispererConfig.PAGE_SEPARATOR: config.get( + WhispererConfig.PAGE_SEPARATOR, + WhispererDefaults.PAGE_SEPARATOR, + ), + # Not providing default value to maintain legacy compatablity + # these are optional params and identifiers for audit + WhispererConfig.TAG: config.get( + WhispererConfig.TAG, + WhispererDefaults.TAG, + ), + WhispererConfig.USE_WEBHOOK: 
config.get(WhispererConfig.USE_WEBHOOK), + WhispererConfig.WEBHOOK_METADATA: config.get( + WhispererConfig.WEBHOOK_METADATA + ), + } + if params[WhispererConfig.MODE] == Modes.LOW_COST.value: + params.update( + { + WhispererConfig.MEDIAN_FILTER_SIZE: config.get( + WhispererConfig.MEDIAN_FILTER_SIZE, + WhispererDefaults.MEDIAN_FILTER_SIZE, + ), + WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get( + WhispererConfig.GAUSSIAN_BLUR_RADIUS, + WhispererDefaults.GAUSSIAN_BLUR_RADIUS, + ), + } + ) + return params diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py index e753bed8..64dd9661 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -26,6 +26,7 @@ WhispererHeader, WhisperStatus, ) +from unstract.sdk.adapters.x2text.llm_whisperer.src.helper import LLMWhispererHelper from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter from unstract.sdk.constants import MimeType from unstract.sdk.file_storage import FileStorage, FileStorageProvider @@ -34,25 +35,43 @@ class LLMWhisperer(X2TextAdapter): + _version = "v2" def __init__(self, settings: dict[str, Any]): super().__init__("LLMWhisperer") self.config = settings + self.config["version"] = settings.get(WhispererConfig.VERSION, "v2") + LLMWhisperer._version = settings.get(WhispererConfig.VERSION, "v2") + + V1_NAME = "LLMWhisperer" + V1_DESCRIPTION = "LLMWhisperer X2Text" + V1_ICON = "/icons/adapter-icons/LLMWhisperer.png" + + V2_ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + V2_NAME = "LLMWhisperer V2" + V2_DESCRIPTION = "LLMWhisperer V2 X2Text" + V2_ICON = "/icons/adapter-icons/LLMWhispererV2.png" @staticmethod def get_id() -> str: - return "llmwhisperer|0a1647f0-f65f-410d-843b-3d979c78350e" - - @staticmethod - def get_name() -> str: - return "LLMWhisperer" - - @staticmethod - def get_description() 
-> str: - return "LLMWhisperer X2Text" - - @staticmethod - def get_icon() -> str: - return "/icons/adapter-icons/LLMWhisperer.png" + return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + + @classmethod + def get_name(cls) -> str: + if cls._version == "v2": + return cls.V2_NAME + return cls.V1_NAME + + @classmethod + def get_description(cls) -> str: + if cls._version == "v2": + return cls.V2_DESCRIPTION + return cls.V1_DESCRIPTION + + @classmethod + def get_icon(cls) -> str: + if cls._version == "v2": + return cls.V2_ICON + return cls.V1_ICON @staticmethod def get_json_schema() -> str: @@ -95,24 +114,23 @@ def _make_request( Returns: Response: Response from the request """ - llm_whisperer_svc_url = ( - f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" - ) + # Determine version and set appropriate URL + version = self.config.get("version", "v1") + base_url = (f"{self.config.get(WhispererConfig.URL)}/api/v2/{request_endpoint}" + if version == "v2" + else f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" + ) + if not headers: headers = self._get_request_headers() try: response: Response if request_method == HTTPMethod.GET: - response = requests.get( - url=llm_whisperer_svc_url, headers=headers, params=params - ) + response = requests.get(url=base_url, headers=headers, params=params) elif request_method == HTTPMethod.POST: response = requests.post( - url=llm_whisperer_svc_url, - headers=headers, - params=params, - data=data, + url=base_url, headers=headers, params=params, data=data ) else: raise ExtractorError(f"Unsupported request method: {request_method}") @@ -120,7 +138,7 @@ def _make_request( except ConnectionError as e: logger.error(f"Adapter error: {e}") raise ExtractorError( - "Unable to connect to LLMWhisperer service, please check the URL" + "Unable to connect to LLMWhisperer service, please check the URL", ) except Timeout as e: msg = "Request to LLMWhisperer has timed out" @@ -213,61 +231,87 @@ def 
test_connection(self) -> bool: return True def _check_status_until_ready( - self, whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] + + self, + whisper_hash: str = "", + headers: dict[str, Any] = None, + params: dict[str, Any] = None, ) -> WhisperStatus: - """Checks the extraction status by polling. + """Checks the extraction status by polling for both v1 and v2. Polls the /whisper-status endpoint in fixed intervals of env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times controlled by env: ADAPTER_LLMW_MAX_POLLS. Args: - whisper_hash (str): Identifier for the extraction, - returned by LLMWhisperer + version (str): Version of the LLMWhisperer API (either 'v1' or 'v2') + config (Optional[dict[str, Any]]): Configuration for v2 (None for v1) + whisper_hash (str): Identifier for the extraction, returned by LLMWhisperer headers (dict[str, Any]): Headers to pass for the status check params (dict[str, Any]): Params to pass for the status check Returns: WhisperStatus: Status of the extraction """ + version = self.config['version'] POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL MAX_POLLS = WhispererDefaults.MAX_POLLS + STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES if version == "v2" else 0 + status_retry_count = 0 request_count = 0 - # Check status in fixed intervals upto max poll count. 
while True: request_count += 1 logger.info( - f"Checking status with interval: {POLL_INTERVAL}s" - f", request count: {request_count} [max: {MAX_POLLS}]" + f"Checking status{' for whisper-hash ' if version == 'v2' else ''}" + f"'{whisper_hash}' with interval: {POLL_INTERVAL}s, request count: " + f"{request_count} [max: {MAX_POLLS}]" ) + + # Make request based on version status_response = self._make_request( request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.STATUS, headers=headers, params=params, ) + if status_response.status_code == 200: status_data = status_response.json() status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN) - logger.info(f"Whisper status for {whisper_hash}: {status}") + logger.info(f"Whisper status for '{whisper_hash}': {status}") if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]: break else: - raise ExtractorError( - "Error checking LLMWhisperer status: " - f"{status_response.status_code} - {status_response.text}" - ) + if version == "v2" and status_retry_count >= STATUS_RETRY_THRESHOLD: + raise ExtractorError( + f"Error checking LLMWhisperer status for whisper-hash " + f"'{whisper_hash}': {status_response.text}" + ) + elif version == "v2": + status_retry_count += 1 + logger.warning( + f"Whisper status for '{whisper_hash}' failed " + f"{status_retry_count} time(s), retrying... 
" + f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}" + ) + else: # v1 error handling + raise ExtractorError( + "Error checking LLMWhisperer status: " + f"{status_response.status_code} - {status_response.text}" + ) - # Exit with error if max poll count is reached if request_count >= MAX_POLLS: raise ExtractorError( - "Unable to extract text after attempting" f" {request_count} times" + f"Unable to extract text for whisper-hash '{whisper_hash}' " + f"after attempting {request_count} times" ) + time.sleep(POLL_INTERVAL) return status + def _extract_async(self, whisper_hash: str) -> str: """Makes an async extraction with LLMWhisperer. @@ -280,12 +324,16 @@ def _extract_async(self, whisper_hash: str) -> str: str: Extracted contents from the file """ logger.info(f"Extracting async for whisper hash: {whisper_hash}") - + version = self.config['version'] headers: dict[str, Any] = self._get_request_headers() - params = { + params =({ WhisperStatus.WHISPER_HASH: whisper_hash, WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, - } + } if version == 'v1' + else { + WhisperStatus.WHISPER_HASH_V2: whisper_hash, + WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, + }) # Polls in fixed intervals and checks status self._check_status_until_ready( @@ -312,22 +360,43 @@ def _send_whisper_request( fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), enable_highlight: bool = False, ) -> requests.Response: + """Sends a whisper request for both v1 and v2. 
+ + Args: + version (str): Version of the LLMWhisperer API (either 'v1' or 'v2') + input_file_path (str): Path to the input file to be processed + fs (FileStorage): File storage object to read the file + enable_highlight (bool): Whether to enable highlight (only for v1) + + Returns: + requests.Response: Response from the whisper request + """ + version = self.config['version'] + config = self.config + params = {} headers = self._get_request_headers() + if version == "v1": + params = self._get_whisper_params(enable_highlight) + elif version == "v2": + params = LLMWhispererHelper.get_whisperer_params(config) + else: + raise ValueError("Unsupported version. Only 'v1' and 'v2' are allowed.") + headers["Content-Type"] = "application/octet-stream" - params = self._get_whisper_params(enable_highlight) - response: requests.Response try: + input_file_data = fs.read(input_file_path, "rb") response = self._make_request( request_method=HTTPMethod.POST, request_endpoint=WhispererEndpoint.WHISPER, headers=headers, params=params, - data=fs.read(path=input_file_path, mode="rb"), + data=input_file_data, ) except OSError as e: logger.error(f"OS error while reading {input_file_path}: {e}") raise ExtractorError(str(e)) + return response def _extract_text_from_response( @@ -337,10 +406,12 @@ def _extract_text_from_response( fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), ) -> str: output_json = {} + version = self.config['version'] if response.status_code == 200: output_json = response.json() elif response.status_code == 202: - whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH) + whisper_hash_key = WhisperStatus.WHISPER_HASH_V2 if version == "v2" else WhisperStatus.WHISPER_HASH + whisper_hash = response.json().get(whisper_hash_key) output_json = self._extract_async(whisper_hash=whisper_hash) else: raise ExtractorError("Couldn't extract text from file") @@ -348,7 +419,8 @@ def _extract_text_from_response( self._write_output_to_file( 
output_json=output_json, output_file_path=Path(output_file_path), fs=fs ) - return output_json.get("text", "") + output_key = "text" if version == "v1" else "result_text" + return output_json.get(output_key, "") def _write_output_to_file( self, @@ -369,7 +441,9 @@ def _write_output_to_file( ExtractorError: If there is an error while writing the output file. """ try: - text_output = output_json.get("text", "") + version = self.config['version'] + output_key = "text" if version == "v1" else "result_text" + text_output = output_json.get(output_key, "") logger.info(f"Writing output to {output_file_path}") fs.write( path=output_file_path, @@ -423,22 +497,35 @@ def process( Defaults to None. Returns: - str: Extracted text + TextExtractionResult: Extracted text along with metadata. """ + if self.config['version'] == "v2": + # V2 logic + response: requests.Response = self._send_whisper_request( + input_file_path, fs=fs + ) + response_text = response.text + response_dict = json.loads(response_text) + metadata = TextExtractionMetadata( + whisper_hash=response_dict.get(WhisperStatus.WHISPER_HASH_V2, "") + ) + else: + # V1 logic + response: requests.Response = self._send_whisper_request( + input_file_path, + fs, + bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), + ) - response: requests.Response = self._send_whisper_request( - input_file_path, - fs, - bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), - ) + metadata = TextExtractionMetadata( + whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + ) - metadata = TextExtractionMetadata( - whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + extracted_text = self._extract_text_from_response( + output_file_path, response, fs ) return TextExtractionResult( - extracted_text=self._extract_text_from_response( - output_file_path, response, fs - ), + extracted_text=extracted_text, extraction_metadata=metadata, ) diff --git 
a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index 2bccb688..d4bde9ea 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -1,13 +1,23 @@ { - "title": "LLMWhisperer v1 Text Extractor", + "title": "LLMWhisperer Text Extractor", "type": "object", "required": [ "adapter_name", "unstract_key", - "url" + "url", + "version" ], - "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.", "properties": { + "version": { + "type": "string", + "title": "Version", + "enum": [ + "v1", + "v2" + ], + "default": "v2", + "description": "Select the version of LLMWhisperer to use." + }, "adapter_name": { "type": "string", "title": "Name", @@ -18,120 +28,166 @@ "type": "string", "title": "URL", "format": "uri", - "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLMWhisperer service. Please note that this version of LLMWhisperer is deprecated." + "default": "https://llmwhisperer-api.us-central.unstract.com", + "description": "Provide the URL of the LLMWhisperer service." 
}, "unstract_key": { "type": "string", "title": "Unstract Key", "format": "password", - "description": "API key obtained from the [Unstract developer portal](https://unstract-api-resource.developer.azure-api.net)" - }, - "mode": { - "type": "string", - "title": "Mode", - "enum": [ - "native_text", - "low_cost", - "high_quality", - "form" - ], - "default": "form", - "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)" - }, - "output_mode": { - "type": "string", - "title": "Output Mode", - "enum": [ - "line-printer", - "dump-text", - "text" - ], - "default": "line-printer", - "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" - }, - - "line_splitter_tolerance": { - "type": "number", - "title": "Line Splitter Tolerance", - "default": 0.4, - "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi column layout with text in each column that is not aligned." - }, - "horizontal_stretch_factor": { - "type": "number", - "title": "Horizontal Stretch Factor", - "default": 1.0, - "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." - }, - "pages_to_extract": { - "type": "string", - "title": "Page number(s) or range to extract", - "default": "", - "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", - "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." 
- }, - "page_seperator": { - "type": "string", - "title": "Page separator", - "default": "<<< >>>", - "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + "description": "API key obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)" } }, - "if": { - "anyOf": [ - { + "allOf": [ + { + "if": { "properties": { - "mode": { - "const": "low_cost" + "version": { + "const": "v1" } } }, - { + "then": { + "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.", "properties": { "mode": { - "const": "high_quality" + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)" + }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "line-printer", + "dump-text", + "text" + ], + "default": "line-printer", + "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" + }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi-column layout with text in each column that is not aligned." 
+ }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other." + }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<< >>>", + "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + } + }, + "required": [ + "mode", + "output_mode" + ] + } + }, + { + "if": { + "properties": { + "version": { + "const": "v2" } } }, - { + "then": { "properties": { "mode": { - "const": "form" + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)." 
+ }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "layout_preserving", + "text" + ], + "default": "layout_preserving", + "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" + }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height." + }, + "line_splitter_strategy": { + "type": "string", + "title": "Line Splitter Strategy", + "default": "left-priority", + "description": "An advanced option for customizing the line splitting process." + }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other." + }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<<", + "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator." + }, + "tag": { + "type": "string", + "title": "Tag", + "default": "default", + "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. 
This can be used for cross-referencing in usage reports." + }, + "use_webhook": { + "type": "string", + "title": "Webhook", + "default": "", + "description": "The webhook's name which should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint." + }, + "webhook_metadata": { + "type": "string", + "title": "Webhook Metadata", + "default": "", + "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint." } } } - ] - }, - "then": { - "properties": { - "median_filter_size": { - "type": "integer", - "title": "Median Filter Size", - "default": 0, - "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." - }, - "gaussian_blur_radius": { - "type": "number", - "title": "Gaussian Blur Radius", - "default": 0.0, - "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." - }, - "mark_vertical_lines": { - "type": "boolean", - "title": "Mark Vertical Lines", - "default": false, - "description": "Detect vertical lines in the document and replicate the same using text (using \"|\" symbol). Use this for displaying tables with borders." - }, - "mark_horizontal_lines": { - "type": "boolean", - "title": "Mark Horizontal Lines", - "default": false, - "description": "Detect horizontal lines in the document and replicate the same using text (using \"-\" symbol). Use this for displaying tables with borders and other horizontal serperators found in the document." 
- } - }, - "required": [ - "median_filter_size", - "gaussian_blur_radius" - ] - } + } + ] } From b771e943f103313fb38eccaaccb0ace2fd2f73e3 Mon Sep 17 00:00:00 2001 From: jagadeeswaran-zipstack Date: Mon, 13 Jan 2025 16:39:47 +0530 Subject: [PATCH 2/5] updated read me and unified adapter names --- .../adapters/x2text/llm_whisperer/README.md | 56 ++++++++- .../x2text/llm_whisperer/src/llm_whisperer.py | 43 +++---- .../llm_whisperer/src/static/json_schema.json | 106 +++++++++++++++++- 3 files changed, 170 insertions(+), 35 deletions(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md index 0c1a9ea1..484b5979 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md @@ -4,7 +4,55 @@ The below env variables are resolved by LLMWhisperer adapter -| Variable | Description | -| ---------------------------- | -------------------------------------------------------------------------------------------- | -| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | -| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | +| Variable | Description | +| ---------------------------- | --------------------------------------------------------------------------------- | +| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | +| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | + +--- + +## id: llm_whisperer_apis_changelog + +# Changelog + +## Version 2.0.0 + +:::warning +This version of the API is not backward compatible with the previous version. 
+::: + +### API endpoint + +- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2` + +### Global change in parameter naming + +- All uses of `whisper-hash` as a parameter have been replaced with `whisper_hash` for consistency. + +### Whisper parameters + +#### Added + +- `mode` (str, optional): The processing mode. +- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document. +- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document. +- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process. +- `lang` (str, optional): The language of the document. +- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes. +- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes. +- `use_webhook` (str, optional): The name of the webhook to call after the document is processed. +- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed. + +#### Removed + +- `timeout` (int, optional): The timeout for API requests. _There is no sync mode now. All requests are async._ +- `force_text_processing` (bool, optional): Whether to force text processing. _This feature is removed._ +- `ocr_provider` (str, optional): The OCR provider to use. _This is superseded by `mode`_ +- `processing_mode` (str, optional): The processing mode. _This is superseded by `mode`_ +- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. _Feature is removed. The data is still available and is sent back when retrieve is called._ + +### New features + +#### Webhooks + +- Added support for webhooks. You can now register a webhook and use it to receive the processed document. 
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py index 64dd9661..7a230d17 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -42,36 +42,27 @@ def __init__(self, settings: dict[str, Any]): self.config["version"] = settings.get(WhispererConfig.VERSION, "v2") LLMWhisperer._version = settings.get(WhispererConfig.VERSION, "v2") - V1_NAME = "LLMWhisperer" - V1_DESCRIPTION = "LLMWhisperer X2Text" - V1_ICON = "/icons/adapter-icons/LLMWhisperer.png" - V2_ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" - V2_NAME = "LLMWhisperer V2" - V2_DESCRIPTION = "LLMWhisperer V2 X2Text" - V2_ICON = "/icons/adapter-icons/LLMWhispererV2.png" + ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + NAME = "LLMWhisperer V2" + DESCRIPTION = "LLMWhisperer V2 X2Text" + ICON = "/icons/adapter-icons/LLMWhispererV2.png" @staticmethod def get_id() -> str: - return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" - - @classmethod - def get_name(cls) -> str: - if cls._version == "v2": - return cls.V2_NAME - return cls.V1_NAME - - @classmethod - def get_description(cls) -> str: - if cls._version == "v2": - return cls.V2_DESCRIPTION - return cls.V1_DESCRIPTION - - @classmethod - def get_icon(cls) -> str: - if cls._version == "v2": - return cls.V2_ICON - return cls.V1_ICON + return LLMWhisperer.ID + + @staticmethod + def get_name() -> str: + return LLMWhisperer.NAME + + @staticmethod + def get_description() -> str: + return LLMWhisperer.DESCRIPTION + + @staticmethod + def get_icon() -> str: + return LLMWhisperer.ICON @staticmethod def get_json_schema() -> str: diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index d4bde9ea..d0316d59 100644 --- 
a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -99,10 +99,63 @@ "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." } }, - "required": [ - "mode", - "output_mode" - ] + "if": { + "anyOf": [ + { + "properties": { + "mode": { + "const": "low_cost" + } + } + }, + { + "properties": { + "mode": { + "const": "high_quality" + } + } + }, + { + "properties": { + "mode": { + "const": "form" + } + } + } + ] + }, + "then": { + "properties": { + "median_filter_size": { + "type": "integer", + "title": "Median Filter Size", + "default": 0, + "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "gaussian_blur_radius": { + "type": "number", + "title": "Gaussian Blur Radius", + "default": 0.0, + "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "mark_vertical_lines": { + "type": "boolean", + "title": "Mark Vertical Lines", + "default": false, + "description": "Detect vertical lines in the document and replicate the same using text (using \"|\" symbol). Use this for displaying tables with borders." + }, + "mark_horizontal_lines": { + "type": "boolean", + "title": "Mark Horizontal Lines", + "default": false, + "description": "Detect horizontal lines in the document and replicate the same using text (using \"-\" symbol). 
Use this for displaying tables with borders and other horizontal separators found in the document." + } + }, + "required": [ + "median_filter_size", + "gaussian_blur_radius" + ] + } } }, { @@ -168,6 +221,18 @@ "default": "<<<", "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator." }, + "mark_vertical_lines": { + "type": "boolean", + "title": "Mark vertical lines", + "default": false, + "description": "States whether to reproduce vertical lines in the document." + }, + "mark_horizontal_lines": { + "type": "boolean", + "title": "Mark horizontal lines", + "default": false, + "description": "States whether to reproduce horizontal lines in the document." + }, "tag": { "type": "string", "title": "Tag", @@ -186,8 +251,39 @@ "default": "", "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint." } + }, + "if": { + "anyOf": [ + { + "properties": { + "mode": { + "const": "low_cost" + } + } + } + ] + }, + "then": { + "properties": { + "median_filter_size": { + "type": "integer", + "title": "Median Filter Size", + "default": 0, + "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "gaussian_blur_radius": { + "type": "number", + "title": "Gaussian Blur Radius", + "default": 0.0, + "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." 
+ } + }, + "required": [ + "median_filter_size", + "gaussian_blur_radius" + ] } } } ] -} +} \ No newline at end of file From 5243ab8e7151c922762941c447092fb73dab30b6 Mon Sep 17 00:00:00 2001 From: jagadeeswaran-zipstack Date: Mon, 13 Jan 2025 16:50:57 +0530 Subject: [PATCH 3/5] env seperation for v1 and v2 --- .../x2text/llm_whisperer/src/__init__.py | 2 +- .../x2text/llm_whisperer/src/constants.py | 4 + .../x2text/llm_whisperer/src/llm_whisperer.py | 4 +- .../x2text/llm_whisperer_v2/README.md | 58 --- .../x2text/llm_whisperer_v2/pyproject.toml | 25 -- .../x2text/llm_whisperer_v2/src/__init__.py | 9 - .../x2text/llm_whisperer_v2/src/constants.py | 107 ----- .../x2text/llm_whisperer_v2/src/helper.py | 388 ------------------ .../llm_whisperer_v2/src/llm_whisperer_v2.py | 93 ----- .../src/static/json_schema.json | 144 ------- 10 files changed, 7 insertions(+), 827 deletions(-) delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py index ba216498..c4f02191 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py @@ -2,7 +2,7 @@ metadata = { "name": LLMWhisperer.__name__, - "version": "1.0.0", + "version": "2.0.0", "adapter": LLMWhisperer, "description": "LLMWhisperer X2Text adapter", "is_active": True, 
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py index 87b11a49..d0c60286 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py @@ -55,6 +55,8 @@ class WhispererEnv: POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" + POLL_INTERVAL_V2 = "ADAPTER_LLMW_POLL_INTERVAL_V2" + MAX_POLLS_V2 = "ADAPTER_LLMW_MAX_POLLS_V2" STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES" @@ -110,6 +112,8 @@ class WhispererDefaults: HORIZONTAL_STRETCH_FACTOR = 1.0 POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) + POLL_INTERVAL_V2 = int(os.getenv(WhispererEnv.POLL_INTERVAL_V2, 30)) + MAX_POLLS_V2 = int(os.getenv(WhispererEnv.MAX_POLLS_V2, 30)) PAGES_TO_EXTRACT = "" ADD_LINE_NOS = True OUTPUT_JSON = True diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py index 7a230d17..59b61a1d 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -245,8 +245,8 @@ def _check_status_until_ready( WhisperStatus: Status of the extraction """ version = self.config['version'] - POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL - MAX_POLLS = WhispererDefaults.MAX_POLLS + POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL_V2 if version == "v2" else WhispererDefaults.POLL_INTERVAL + MAX_POLLS = WhispererDefaults.MAX_POLLS_V2 if version == "v2" else WhispererDefaults.MAX_POLLS STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES if version == "v2" else 0 status_retry_count = 0 request_count = 0 diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md deleted file 
mode 100644 index f33810b3..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Unstract LLMWWhisperer v2 X2Text Adapter - -## Env variables - -The below env variables are resolved by LLMWhisperer adapter - -| Variable | Description | -| ---------------------------- | -------------------------------------------------------------------------------------------- | -| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | -| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | - - ---- -id: llm_whisperer_apis_changelog ---- - -# Changelog - -## Version 2.0.0 - -:::warning -This version of the API is not backward compatible with the previous version. -::: - -### API endpoint - -- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2` - -### Global change in parameter naming - -- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency. - -### Whisper parameters - -#### Added -- `mode` (str, optional): The processing mode. -- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document. -- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document. -- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process. -- `lang` (str, optional): The language of the document. -- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes. -- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes. -- `use_webhook` (str, optional): The name of the webhook to call after the document is processed. -- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed. 
- -#### Removed -- `timeout` (int, optional): The timeout for API requests. *There is no sync mode now. All requests are async.* -- `force_text_processing` (bool, optional): Whether to force text processing. *This is feature is removed* -- `ocr_provider` (str, optional): The OCR provider to use. *This is superseded by `mode`* -- `processing_mode` (str, optional): The processing mode. *This is superseded by `mode`* -- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. *Feature is removed. Data still available and set back when retrieve is called* - - -### New features - -#### Webhooks - -- Added support for webhooks. You can now register a webhook and use it to receive the processed document. diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml deleted file mode 100644 index bf7ad3a4..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml +++ /dev/null @@ -1,25 +0,0 @@ -[build-system] -requires = ["pdm-backend"] -build-backend = "pdm.backend" - - -[project] -name = "unstract-llm_whisperer-x2text-v2" -version = "0.0.1" -description = "V2 of LLMWhisperer X2Text Adapter" -authors = [ - {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, -] -dependencies = [ -] -requires-python = ">=3.9" -readme = "README.md" -classifiers = [ - "Programming Language :: Python" -] -license = {text = "MIT"} - -[tool.pdm.build] -includes = ["src"] -package-dir = "src" -# source-includes = ["tests"] diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py deleted file mode 100644 index 14240c6a..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .llm_whisperer_v2 import LLMWhispererV2 - -metadata = { - "name": LLMWhispererV2.__name__, - "version": "1.0.0", - "adapter": 
LLMWhispererV2, - "description": "LLMWhispererV2 X2Text adapter", - "is_active": True, -} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py deleted file mode 100644 index 7e2d7dcf..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -from enum import Enum - - -class Modes(Enum): - NATIVE_TEXT = "native_text" - LOW_COST = "low_cost" - HIGH_QUALITY = "high_quality" - FORM = "form" - - -class OutputModes(Enum): - LAYOUT_PRESERVING = "layout_preserving" - TEXT = "text" - - -class HTTPMethod(Enum): - GET = "GET" - POST = "POST" - - -class WhispererHeader: - UNSTRACT_KEY = "unstract-key" - - -class WhispererEndpoint: - """Endpoints available at LLMWhisperer service.""" - - TEST_CONNECTION = "test-connection" - WHISPER = "whisper" - STATUS = "whisper-status" - RETRIEVE = "whisper-retrieve" - - -class WhispererEnv: - """Env variables for LLMWhisperer. - - Can be used to alter behaviour at runtime. - - Attributes: - POLL_INTERVAL: Time in seconds to wait before polling - LLMWhisperer's status API. Defaults to 30s - MAX_POLLS: Total number of times to poll the status API. - Set to -1 to poll indefinitely. Defaults to -1 - STATUS_RETRIES: Number of times to retry calling LLLMWhisperer's status API - on failure during polling. Defaults to 5. 
- """ - - POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" - MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" - STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES" - - -class WhispererConfig: - """Dictionary keys used to configure LLMWhisperer service.""" - - URL = "url" - MODE = "mode" - OUTPUT_MODE = "output_mode" - UNSTRACT_KEY = "unstract_key" - MEDIAN_FILTER_SIZE = "median_filter_size" - GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" - LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" - LINE_SPLITTER_STRATEGY = "line_splitter_strategy" - HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" - PAGES_TO_EXTRACT = "pages_to_extract" - MARK_VERTICAL_LINES = "mark_vertical_lines" - MARK_HORIZONTAL_LINES = "mark_horizontal_lines" - PAGE_SEPARATOR = "page_seperator" - URL_IN_POST = "url_in_post" - TAG = "tag" - USE_WEBHOOK = "use_webhook" - WEBHOOK_METADATA = "webhook_metadata" - TEXT_ONLY = "text_only" - - -class WhisperStatus: - """Values returned / used by /whisper-status endpoint.""" - - PROCESSING = "processing" - PROCESSED = "processed" - DELIVERED = "delivered" - UNKNOWN = "unknown" - # Used for async processing - WHISPER_HASH = "whisper_hash" - STATUS = "status" - - -class WhispererDefaults: - """Defaults meant for LLMWhisperer.""" - - MEDIAN_FILTER_SIZE = 0 - GAUSSIAN_BLUR_RADIUS = 0.0 - FORCE_TEXT_PROCESSING = False - LINE_SPLITTER_TOLERANCE = 0.75 - LINE_SPLITTER_STRATEGY = "left-priority" - HORIZONTAL_STRETCH_FACTOR = 1.0 - POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) - MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) - STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5)) - PAGES_TO_EXTRACT = "" - PAGE_SEPARATOR = "<<<" - MARK_VERTICAL_LINES = False - MARK_HORIZONTAL_LINES = False - URL_IN_POST = False - TAG = "default" - TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py deleted file mode 100644 index dfea856c..00000000 --- 
a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py +++ /dev/null @@ -1,388 +0,0 @@ -import json -import logging -import time -from pathlib import Path -from typing import Any, Optional - -import requests -from requests import Response -from requests.exceptions import ConnectionError, HTTPError, Timeout - -from unstract.sdk.adapters.exceptions import ExtractorError -from unstract.sdk.adapters.utils import AdapterUtils -from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( - HTTPMethod, - Modes, - OutputModes, - WhispererConfig, - WhispererDefaults, - WhispererEndpoint, - WhispererHeader, - WhisperStatus, -) -from unstract.sdk.constants import MimeType -from unstract.sdk.file_storage import FileStorage, FileStorageProvider - -logger = logging.getLogger(__name__) - - -class LLMWhispererHelper: - @staticmethod - def get_request_headers(config: dict[str, Any]) -> dict[str, Any]: - """Obtains the request headers to authenticate with LLMWhisperer. - - Returns: - str: Request headers - """ - return { - "accept": MimeType.JSON, - WhispererHeader.UNSTRACT_KEY: config.get(WhispererConfig.UNSTRACT_KEY), - } - - @staticmethod - def make_request( - config: dict[str, Any], - request_method: HTTPMethod, - request_endpoint: str, - headers: Optional[dict[str, Any]] = None, - params: Optional[dict[str, Any]] = None, - data: Optional[Any] = None, - ) -> Response: - """Makes a request to LLMWhisperer service. - - Args: - request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST - request_endpoint (str): LLMWhisperer endpoint to hit - headers (Optional[dict[str, Any]], optional): Headers to pass. - Defaults to None. - params (Optional[dict[str, Any]], optional): Query params to pass. - Defaults to None. - data (Optional[Any], optional): Data to pass in case of POST. - Defaults to None. 
- - Returns: - Response: Response from the request - """ - llm_whisperer_svc_url = ( - f"{config.get(WhispererConfig.URL)}" f"/api/v2/{request_endpoint}" - ) - if not headers: - headers = LLMWhispererHelper.get_request_headers(config=config) - - try: - response: Response - if request_method == HTTPMethod.GET: - response = requests.get( - url=llm_whisperer_svc_url, headers=headers, params=params - ) - elif request_method == HTTPMethod.POST: - response = requests.post( - url=llm_whisperer_svc_url, - headers=headers, - params=params, - data=data, - ) - else: - raise ExtractorError( - f"Unsupported request method: {request_method}", status_code=500 - ) - response.raise_for_status() - except ConnectionError as e: - logger.error(f"Adapter error: {e}") - raise ExtractorError( - "Unable to connect to LLMWhisperer service, please check the URL", - actual_err=e, - status_code=503, - ) - except Timeout as e: - msg = "Request to LLMWhisperer has timed out" - logger.error(f"{msg}: {e}") - raise ExtractorError(msg, actual_err=e, status_code=504) - except HTTPError as e: - logger.error(f"Adapter error: {e}") - default_err = "Error while calling the LLMWhisperer service" - msg = AdapterUtils.get_msg_from_request_exc( - err=e, message_key="message", default_err=default_err - ) - raise ExtractorError(msg) - return response - - @staticmethod - def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: - """Gets query params meant for /whisper endpoint. - - The params is filled based on the configuration passed. 
- - Returns: - dict[str, Any]: Query params - """ - params = { - WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value), - WhispererConfig.OUTPUT_MODE: config.get( - WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value - ), - WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get( - WhispererConfig.LINE_SPLITTER_TOLERANCE, - WhispererDefaults.LINE_SPLITTER_TOLERANCE, - ), - WhispererConfig.LINE_SPLITTER_STRATEGY: config.get( - WhispererConfig.LINE_SPLITTER_STRATEGY, - WhispererDefaults.LINE_SPLITTER_STRATEGY, - ), - WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get( - WhispererConfig.HORIZONTAL_STRETCH_FACTOR, - WhispererDefaults.HORIZONTAL_STRETCH_FACTOR, - ), - WhispererConfig.PAGES_TO_EXTRACT: config.get( - WhispererConfig.PAGES_TO_EXTRACT, - WhispererDefaults.PAGES_TO_EXTRACT, - ), - WhispererConfig.MARK_VERTICAL_LINES: config.get( - WhispererConfig.MARK_VERTICAL_LINES, - WhispererDefaults.MARK_VERTICAL_LINES, - ), - WhispererConfig.MARK_HORIZONTAL_LINES: config.get( - WhispererConfig.MARK_HORIZONTAL_LINES, - WhispererDefaults.MARK_HORIZONTAL_LINES, - ), - WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST, - WhispererConfig.PAGE_SEPARATOR: config.get( - WhispererConfig.PAGE_SEPARATOR, - WhispererDefaults.PAGE_SEPARATOR, - ), - # Not providing default value to maintain legacy compatablity - # these are optional params and identifiers for audit - WhispererConfig.TAG: config.get( - WhispererConfig.TAG, - WhispererDefaults.TAG, - ), - WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK), - WhispererConfig.WEBHOOK_METADATA: config.get( - WhispererConfig.WEBHOOK_METADATA - ), - } - if params[WhispererConfig.MODE] == Modes.LOW_COST.value: - params.update( - { - WhispererConfig.MEDIAN_FILTER_SIZE: config.get( - WhispererConfig.MEDIAN_FILTER_SIZE, - WhispererDefaults.MEDIAN_FILTER_SIZE, - ), - WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get( - WhispererConfig.GAUSSIAN_BLUR_RADIUS, - 
WhispererDefaults.GAUSSIAN_BLUR_RADIUS, - ), - } - ) - return params - - @staticmethod - def check_status_until_ready( - config: dict[str, Any], - whisper_hash: str, - headers: dict[str, Any], - params: dict[str, Any], - ) -> WhisperStatus: - """Checks the extraction status by polling. - - Polls the /whisper-status endpoint in fixed intervals of - env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times - controlled by env: ADAPTER_LLMW_MAX_POLLS. - - Args: - whisper_hash (str): Identifier for the extraction, - returned by LLMWhisperer - headers (dict[str, Any]): Headers to pass for the status check - params (dict[str, Any]): Params to pass for the status check - - Returns: - WhisperStatus: Status of the extraction - """ - POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL - MAX_POLLS = WhispererDefaults.MAX_POLLS - STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES - status_retry_count = 0 - request_count = 0 - - # Check status in fixed intervals upto max poll count. - while True: - request_count += 1 - logger.info( - f"Checking status for whisper-hash '{whisper_hash}' with interval: " - f"{POLL_INTERVAL}s, request count: {request_count} [max: {MAX_POLLS}]" - ) - status_response = LLMWhispererHelper.make_request( - config=config, - request_method=HTTPMethod.GET, - request_endpoint=WhispererEndpoint.STATUS, - headers=headers, - params=params, - ) - if status_response.status_code == 200: - status_data = status_response.json() - status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN) - logger.info(f"Whisper status for '{whisper_hash}': {status}") - if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]: - break - else: - if status_retry_count >= STATUS_RETRY_THRESHOLD: - raise ExtractorError( - f"Error checking LLMWhisperer status for whisper-hash " - f"'{whisper_hash}': {status_response.text}" - ) - else: - status_retry_count += 1 - logger.warning( - f"Whisper status for '{whisper_hash}' failed " - f"{status_retry_count} time(s), 
retrying... " - f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}" - ) - - # Exit with error if max poll count is reached - if request_count >= MAX_POLLS: - raise ExtractorError( - f"Unable to extract text for whisper-hash '{whisper_hash}' " - f"after attempting {request_count} times" - ) - time.sleep(POLL_INTERVAL) - - return status - - @staticmethod - def extract_async(config: dict[str, Any], whisper_hash: str) -> dict[Any, Any]: - """Makes an async extraction with LLMWhisperer. - - Polls and checks the status first before proceeding to retrieve once. - - Args: - whisper_hash (str): Identifier of the extraction - - Returns: - str: Extracted contents from the file - """ - logger.info(f"Extracting async for whisper hash: {whisper_hash}") - - headers: dict[str, Any] = LLMWhispererHelper.get_request_headers(config) - params = { - WhisperStatus.WHISPER_HASH: whisper_hash, - WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, - } - - # Polls in fixed intervals and checks status - LLMWhispererHelper.check_status_until_ready( - config=config, whisper_hash=whisper_hash, headers=headers, params=params - ) - - retrieve_response = LLMWhispererHelper.make_request( - config=config, - request_method=HTTPMethod.GET, - request_endpoint=WhispererEndpoint.RETRIEVE, - headers=headers, - params=params, - ) - if retrieve_response.status_code == 200: - return retrieve_response.json() - else: - raise ExtractorError( - "Error retrieving from LLMWhisperer: " - f"{retrieve_response.status_code} - {retrieve_response.text}" - ) - - @staticmethod - def send_whisper_request( - input_file_path: str, - config: dict[str, Any], - fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), - ) -> requests.Response: - headers = LLMWhispererHelper.get_request_headers(config) - headers["Content-Type"] = "application/octet-stream" - params = LLMWhispererHelper.get_whisperer_params(config) - - response: requests.Response - try: - input_file_data = fs.read(input_file_path, "rb") 
- response = LLMWhispererHelper.make_request( - config=config, - request_method=HTTPMethod.POST, - request_endpoint=WhispererEndpoint.WHISPER, - headers=headers, - params=params, - data=input_file_data, - ) - except OSError as e: - logger.error(f"OS error while reading {input_file_path}: {e}") - raise ExtractorError(str(e)) - return response - - @staticmethod - def extract_text_from_response( - config: dict[str, Any], - output_file_path: Optional[str], - response_dict: dict[str, Any], - response: Response, - fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), - ) -> str: - output_json = {} - if response.status_code == 200: - output_json = response.json() - elif response.status_code == 202: - whisper_hash = response_dict.get(WhisperStatus.WHISPER_HASH) - output_json = LLMWhispererHelper.extract_async( - config=config, whisper_hash=whisper_hash - ) - else: - raise ExtractorError("Couldn't extract text from file") - if output_file_path: - LLMWhispererHelper.write_output_to_file( - output_json=output_json, - output_file_path=Path(output_file_path), - fs=fs, - ) - return output_json.get("result_text", "") - - @staticmethod - def write_output_to_file( - output_json: dict, - output_file_path: Path, - fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), - ) -> None: - """Writes the extracted text and metadata to the specified output file - and metadata file. - - Args: - output_json (dict): The dictionary containing the extracted data, - with "text" as the key for the main content. - output_file_path (Path): The file path where the extracted text - should be written. - - Raises: - ExtractorError: If there is an error while writing the output file. 
- """ - try: - text_output = output_json.get("result_text", "") - logger.info(f"Writing output to {output_file_path}") - fs.write( - path=output_file_path, mode="w", data=text_output, encoding="utf-8" - ) - except Exception as e: - logger.error(f"Error while writing {output_file_path}: {e}") - raise ExtractorError(str(e)) - try: - # Define the directory of the output file and metadata paths - output_dir = output_file_path.parent - metadata_dir = output_dir / "metadata" - metadata_file_name = output_file_path.with_suffix(".json").name - metadata_file_path = metadata_dir / metadata_file_name - # Ensure the metadata directory exists - fs.mkdir(create_parents=True, path=metadata_dir) - # Remove the "result_text" key from the metadata - metadata = { - key: value for key, value in output_json.items() if key != "result_text" - } - metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) - logger.info(f"Writing metadata to {metadata_file_path}") - fs.write( - path=metadata_file_path, mode="w", data=metadata_json, encoding="utf-8" - ) - except Exception as e: - logger.warn(f"Error while writing metadata to {metadata_file_path}: {e}") diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py deleted file mode 100644 index 94d6b246..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ /dev/null @@ -1,93 +0,0 @@ -import json -import logging -import os -from typing import Any, Optional - -import requests - -from unstract.sdk.adapters.x2text.constants import X2TextConstants -from unstract.sdk.adapters.x2text.dto import ( - TextExtractionMetadata, - TextExtractionResult, -) -from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( - HTTPMethod, - WhispererEndpoint, -) -from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper -from unstract.sdk.adapters.x2text.x2text_adapter import 
X2TextAdapter -from unstract.sdk.file_storage import FileStorage, FileStorageProvider - -logger = logging.getLogger(__name__) - - -class LLMWhispererV2(X2TextAdapter): - def __init__(self, settings: dict[str, Any]): - super().__init__("LLMWhispererV2") - self.config = settings - - @staticmethod - def get_id() -> str: - return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" - - @staticmethod - def get_name() -> str: - return "LLMWhisperer V2" - - @staticmethod - def get_description() -> str: - return "LLMWhisperer V2 X2Text" - - @staticmethod - def get_icon() -> str: - return "/icons/adapter-icons/LLMWhispererV2.png" - - @staticmethod - def get_json_schema() -> str: - f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") - schema = f.read() - f.close() - return schema - - def test_connection(self) -> bool: - LLMWhispererHelper.make_request( - config=self.config, - request_method=HTTPMethod.GET, - request_endpoint=WhispererEndpoint.TEST_CONNECTION, - ) - return True - - def process( - self, - input_file_path: str, - output_file_path: Optional[str] = None, - fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), - **kwargs: dict[Any, Any], - ) -> TextExtractionResult: - """Used to extract text from documents. - - Args: - input_file_path (str): Path to file that needs to be extracted - output_file_path (Optional[str], optional): File path to write - extracted text into, if None doesn't write to a file. - Defaults to None. 
- - Returns: - str: Extracted text - """ - - response: requests.Response = LLMWhispererHelper.send_whisper_request( - input_file_path, self.config, fs=fs - ) - response_text = response.text - reponse_dict = json.loads(response_text) - metadata = TextExtractionMetadata( - whisper_hash=reponse_dict.get(X2TextConstants.WHISPER_HASH_V2, "") - ) - - return TextExtractionResult( - extracted_text=LLMWhispererHelper.extract_text_from_response( - self.config, output_file_path, reponse_dict, response, fs=fs - ), - extraction_metadata=metadata, - ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json deleted file mode 100644 index b2a62c4b..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json +++ /dev/null @@ -1,144 +0,0 @@ -{ - "title": "LLMWhisperer v2 Text Extractor", - "type": "object", - "required": [ - "adapter_name", - "unstract_key", - "url" - ], - "properties": { - "adapter_name": { - "type": "string", - "title": "Name", - "default": "llm-whisperer-v2", - "description": "Provide a unique name for this adapter instance. Example: LLMWhisperer 1" - }, - "url": { - "type": "string", - "title": "URL", - "format": "uri", - "default": "https://llmwhisperer-api.us-central.unstract.com", - "description": "Provide the base URL of the LLMWhisperer service based on your region, can be obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)." 
- }, - "unstract_key": { - "type": "string", - "title": "Unstract Key", - "format": "password", - "description": "API key obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)" - }, - "mode": { - "type": "string", - "title": "Mode", - "enum": [ - "native_text", - "low_cost", - "high_quality", - "form" - ], - "default": "form", - "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)." - }, - "output_mode": { - "type": "string", - "title": "Output Mode", - "enum": [ - "layout_preserving", - "text" - ], - "default": "layout_preserving", - "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" - }, - "line_splitter_tolerance": { - "type": "number", - "title": "Line Splitter Tolerance", - "default": 0.4, - "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height" - }, - "line_splitter_strategy": { - "type": "string", - "title": "Line Splitter Strategy", - "default":"left-priority", - "description": "An advanced option for customizing the line splitting process." - }, - "horizontal_stretch_factor": { - "type": "number", - "title": "Horizontal Stretch Factor", - "default": 1.0, - "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." - }, - "pages_to_extract": { - "type": "string", - "title": "Page number(s) or range to extract", - "default": "", - "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", - "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). 
Leave it empty to extract all pages." - }, - "page_seperator": { - "type": "string", - "title": "Page separator", - "default": "<<<", - "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator." - }, - "mark_vertical_lines": { - "type": "boolean", - "title": "Mark vertical lines", - "default": false, - "description": "States whether to reproduce vertical lines in the document." - }, - "mark_horizontal_lines": { - "type": "boolean", - "title": "Mark horizontal lines", - "default": false, - "description": "States whether to reproduce horizontal lines in the document." - }, - "tag": { - "type": "string", - "title": "Tag", - "default": "default", - "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports." - }, - "use_webhook": { - "type": "string", - "title": "Webhook", - "default": "", - "description": "The webhook's name which will should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint" - }, - "webhook_metadata": { - "type": "string", - "title": "Webhook Metadata", - "default": "", - "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint." - } - }, - "if": { - "anyOf": [ - { - "properties": { - "mode": { - "const": "low_cost" - } - } - } - ] - }, - "then": { - "properties": { - "median_filter_size": { - "type": "integer", - "title": "Median Filter Size", - "default": 0, - "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. 
Available only in the Enterprise version." - }, - "gaussian_blur_radius": { - "type": "number", - "title": "Gaussian Blur Radius", - "default": 0.0, - "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." - } - }, - "required": [ - "median_filter_size", - "gaussian_blur_radius" - ] - } -} From 4f235a5a9a109e79a1f72259c63662cc50ce3714 Mon Sep 17 00:00:00 2001 From: jagadeeswaran-zipstack Date: Mon, 13 Jan 2025 16:58:50 +0530 Subject: [PATCH 4/5] reverting version updated in init file --- src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py index c4f02191..ba216498 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py @@ -2,7 +2,7 @@ metadata = { "name": LLMWhisperer.__name__, - "version": "2.0.0", + "version": "1.0.0", "adapter": LLMWhisperer, "description": "LLMWhisperer X2Text adapter", "is_active": True, From 8ab74471e1da4e7c7c45f9ca6222fc97a55dd950 Mon Sep 17 00:00:00 2001 From: jagadeeswaran-zipstack Date: Thu, 16 Jan 2025 10:22:54 +0530 Subject: [PATCH 5/5] adapter name change --- .../sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py index 59b61a1d..9f3d862e 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -44,8 +44,8 @@ def __init__(self, settings: dict[str, 
Any]): ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" - NAME = "LLMWhisperer V2" - DESCRIPTION = "LLMWhisperer V2 X2Text" + NAME = "LLMWhisperer" + DESCRIPTION = "LLMWhisperer X2Text" ICON = "/icons/adapter-icons/LLMWhispererV2.png" @staticmethod