diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py index 6b11d65b..87b11a49 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py @@ -18,6 +18,7 @@ class OutputModes(Enum): LINE_PRINTER = "line-printer" DUMP_TEXT = "dump-text" TEXT = "text" + LAYOUT_PRESERVING = "layout_preserving" class HTTPMethod(Enum): @@ -48,10 +49,13 @@ class WhispererEnv: LLMWhisperer's status API. Defaults to 30s MAX_POLLS: Total number of times to poll the status API. Set to -1 to poll indefinitely. Defaults to -1 + STATUS_RETRIES: Number of times to retry calling LLMWhisperer's status API + on failure during polling. Defaults to 5. """ POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" + STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES" class WhispererConfig: @@ -66,6 +70,7 @@ class WhispererConfig: GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" FORCE_TEXT_PROCESSING = "force_text_processing" LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" + LINE_SPLITTER_STRATEGY = "line_splitter_strategy" HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" PAGES_TO_EXTRACT = "pages_to_extract" STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting" @@ -74,7 +79,12 @@ class WhispererConfig: PAGE_SEPARATOR = "page_seperator" MARK_VERTICAL_LINES = "mark_vertical_lines" MARK_HORIZONTAL_LINES = "mark_horizontal_lines" - + URL_IN_POST = "url_in_post" + TAG = "tag" + USE_WEBHOOK = "use_webhook" + WEBHOOK_METADATA = "webhook_metadata" + TEXT_ONLY = "text_only" + VERSION = "version" class WhisperStatus: """Values returned / used by /whisper-status endpoint.""" @@ -86,6 +96,7 @@ class WhisperStatus: # Used for async processing WHISPER_HASH = "whisper-hash" STATUS = "status" + WHISPER_HASH_V2 = "whisper_hash" class WhispererDefaults: @@ -95,6 +106,7 @@ class WhispererDefaults: 
GAUSSIAN_BLUR_RADIUS = 0.0 FORCE_TEXT_PROCESSING = False LINE_SPLITTER_TOLERANCE = 0.75 + LINE_SPLITTER_STRATEGY = "left-priority" HORIZONTAL_STRETCH_FACTOR = 1.0 POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) @@ -104,3 +116,7 @@ class WhispererDefaults: PAGE_SEPARATOR = "<<< >>>" MARK_VERTICAL_LINES = False MARK_HORIZONTAL_LINES = False + STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5)) + URL_IN_POST = False + TAG = "default" + TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py new file mode 100644 index 00000000..11648674 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py @@ -0,0 +1,81 @@ +import logging +from typing import Any +from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import ( + Modes, + OutputModes, + WhispererConfig, + WhispererDefaults, +) +logger = logging.getLogger(__name__) + + +class LLMWhispererHelper: + + @staticmethod + def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: + """Gets query params meant for /whisper endpoint. + + The params is filled based on the configuration passed. 
+ + Returns: + dict[str, Any]: Query params + """ + params = { + WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value), + WhispererConfig.OUTPUT_MODE: config.get( + WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value + ), + WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get( + WhispererConfig.LINE_SPLITTER_TOLERANCE, + WhispererDefaults.LINE_SPLITTER_TOLERANCE, + ), + WhispererConfig.LINE_SPLITTER_STRATEGY: config.get( + WhispererConfig.LINE_SPLITTER_STRATEGY, + WhispererDefaults.LINE_SPLITTER_STRATEGY, + ), + WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get( + WhispererConfig.HORIZONTAL_STRETCH_FACTOR, + WhispererDefaults.HORIZONTAL_STRETCH_FACTOR, + ), + WhispererConfig.PAGES_TO_EXTRACT: config.get( + WhispererConfig.PAGES_TO_EXTRACT, + WhispererDefaults.PAGES_TO_EXTRACT, + ), + WhispererConfig.MARK_VERTICAL_LINES: config.get( + WhispererConfig.MARK_VERTICAL_LINES, + WhispererDefaults.MARK_VERTICAL_LINES, + ), + WhispererConfig.MARK_HORIZONTAL_LINES: config.get( + WhispererConfig.MARK_HORIZONTAL_LINES, + WhispererDefaults.MARK_HORIZONTAL_LINES, + ), + WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST, + WhispererConfig.PAGE_SEPARATOR: config.get( + WhispererConfig.PAGE_SEPARATOR, + WhispererDefaults.PAGE_SEPARATOR, + ), + # Not providing default value to maintain legacy compatibility + # these are optional params and identifiers for audit + WhispererConfig.TAG: config.get( + WhispererConfig.TAG, + WhispererDefaults.TAG, + ), + WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK), + WhispererConfig.WEBHOOK_METADATA: config.get( + WhispererConfig.WEBHOOK_METADATA + ), + } + if params[WhispererConfig.MODE] == Modes.LOW_COST.value: + params.update( + { + WhispererConfig.MEDIAN_FILTER_SIZE: config.get( + WhispererConfig.MEDIAN_FILTER_SIZE, + WhispererDefaults.MEDIAN_FILTER_SIZE, + ), + WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get( + WhispererConfig.GAUSSIAN_BLUR_RADIUS, + 
WhispererDefaults.GAUSSIAN_BLUR_RADIUS, + ), + } + ) + return params diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py index e753bed8..64dd9661 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -26,6 +26,7 @@ WhispererHeader, WhisperStatus, ) +from unstract.sdk.adapters.x2text.llm_whisperer.src.helper import LLMWhispererHelper from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter from unstract.sdk.constants import MimeType from unstract.sdk.file_storage import FileStorage, FileStorageProvider @@ -34,25 +35,43 @@ class LLMWhisperer(X2TextAdapter): + _version = "v2" def __init__(self, settings: dict[str, Any]): super().__init__("LLMWhisperer") self.config = settings + self.config["version"] = settings.get(WhispererConfig.VERSION, "v2") + LLMWhisperer._version = settings.get(WhispererConfig.VERSION, "v2") + + V1_NAME = "LLMWhisperer" + V1_DESCRIPTION = "LLMWhisperer X2Text" + V1_ICON = "/icons/adapter-icons/LLMWhisperer.png" + + V2_ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + V2_NAME = "LLMWhisperer V2" + V2_DESCRIPTION = "LLMWhisperer V2 X2Text" + V2_ICON = "/icons/adapter-icons/LLMWhispererV2.png" @staticmethod def get_id() -> str: - return "llmwhisperer|0a1647f0-f65f-410d-843b-3d979c78350e" - - @staticmethod - def get_name() -> str: - return "LLMWhisperer" - - @staticmethod - def get_description() -> str: - return "LLMWhisperer X2Text" - - @staticmethod - def get_icon() -> str: - return "/icons/adapter-icons/LLMWhisperer.png" + return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + + @classmethod + def get_name(cls) -> str: + if cls._version == "v2": + return cls.V2_NAME + return cls.V1_NAME + + @classmethod + def get_description(cls) -> str: + if cls._version == "v2": + return cls.V2_DESCRIPTION + return cls.V1_DESCRIPTION 
+ + @classmethod + def get_icon(cls) -> str: + if cls._version == "v2": + return cls.V2_ICON + return cls.V1_ICON @staticmethod def get_json_schema() -> str: @@ -95,24 +114,23 @@ def _make_request( Returns: Response: Response from the request """ - llm_whisperer_svc_url = ( - f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" - ) + # Determine version and set appropriate URL + version = self.config.get("version", "v1") + base_url = (f"{self.config.get(WhispererConfig.URL)}/api/v2/{request_endpoint}" + if version == "v2" + else f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" + ) + if not headers: headers = self._get_request_headers() try: response: Response if request_method == HTTPMethod.GET: - response = requests.get( - url=llm_whisperer_svc_url, headers=headers, params=params - ) + response = requests.get(url=base_url, headers=headers, params=params) elif request_method == HTTPMethod.POST: response = requests.post( - url=llm_whisperer_svc_url, - headers=headers, - params=params, - data=data, + url=base_url, headers=headers, params=params, data=data ) else: raise ExtractorError(f"Unsupported request method: {request_method}") @@ -120,7 +138,7 @@ def _make_request( except ConnectionError as e: logger.error(f"Adapter error: {e}") raise ExtractorError( - "Unable to connect to LLMWhisperer service, please check the URL" + "Unable to connect to LLMWhisperer service, please check the URL", ) except Timeout as e: msg = "Request to LLMWhisperer has timed out" @@ -213,61 +231,87 @@ def test_connection(self) -> bool: return True def _check_status_until_ready( - self, whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] + + self, + whisper_hash: str = "", + headers: dict[str, Any] = None, + params: dict[str, Any] = None, ) -> WhisperStatus: - """Checks the extraction status by polling. + """Checks the extraction status by polling for both v1 and v2. 
Polls the /whisper-status endpoint in fixed intervals of env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times controlled by env: ADAPTER_LLMW_MAX_POLLS. Args: - whisper_hash (str): Identifier for the extraction, - returned by LLMWhisperer + version (str): Version of the LLMWhisperer API (either 'v1' or 'v2') + config (Optional[dict[str, Any]]): Configuration for v2 (None for v1) + whisper_hash (str): Identifier for the extraction, returned by LLMWhisperer headers (dict[str, Any]): Headers to pass for the status check params (dict[str, Any]): Params to pass for the status check Returns: WhisperStatus: Status of the extraction """ + version = self.config['version'] POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL MAX_POLLS = WhispererDefaults.MAX_POLLS + STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES if version == "v2" else 0 + status_retry_count = 0 request_count = 0 - # Check status in fixed intervals upto max poll count. while True: request_count += 1 logger.info( - f"Checking status with interval: {POLL_INTERVAL}s" - f", request count: {request_count} [max: {MAX_POLLS}]" + f"Checking status{' for whisper-hash ' if version == 'v2' else ''}" + f"'{whisper_hash}' with interval: {POLL_INTERVAL}s, request count: " + f"{request_count} [max: {MAX_POLLS}]" ) + + # Make request based on version status_response = self._make_request( request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.STATUS, headers=headers, params=params, ) + if status_response.status_code == 200: status_data = status_response.json() status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN) - logger.info(f"Whisper status for {whisper_hash}: {status}") + logger.info(f"Whisper status for '{whisper_hash}': {status}") if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]: break else: - raise ExtractorError( - "Error checking LLMWhisperer status: " - f"{status_response.status_code} - {status_response.text}" - ) + if version == "v2" and status_retry_count 
>= STATUS_RETRY_THRESHOLD: + raise ExtractorError( + f"Error checking LLMWhisperer status for whisper-hash " + f"'{whisper_hash}': {status_response.text}" + ) + elif version == "v2": + status_retry_count += 1 + logger.warning( + f"Whisper status for '{whisper_hash}' failed " + f"{status_retry_count} time(s), retrying... " + f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}" + ) + else: # v1 error handling + raise ExtractorError( + "Error checking LLMWhisperer status: " + f"{status_response.status_code} - {status_response.text}" + ) - # Exit with error if max poll count is reached if request_count >= MAX_POLLS: raise ExtractorError( - "Unable to extract text after attempting" f" {request_count} times" + f"Unable to extract text for whisper-hash '{whisper_hash}' " + f"after attempting {request_count} times" ) + time.sleep(POLL_INTERVAL) return status + def _extract_async(self, whisper_hash: str) -> str: """Makes an async extraction with LLMWhisperer. @@ -280,12 +324,16 @@ def _extract_async(self, whisper_hash: str) -> str: str: Extracted contents from the file """ logger.info(f"Extracting async for whisper hash: {whisper_hash}") - + version = self.config['version'] headers: dict[str, Any] = self._get_request_headers() - params = { + params =({ WhisperStatus.WHISPER_HASH: whisper_hash, WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, - } + } if version == 'v1' + else { + WhisperStatus.WHISPER_HASH_V2: whisper_hash, + WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, + }) # Polls in fixed intervals and checks status self._check_status_until_ready( @@ -312,22 +360,43 @@ def _send_whisper_request( fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), enable_highlight: bool = False, ) -> requests.Response: + """Sends a whisper request for both v1 and v2. 
+ + Args: + version (str): Version of the LLMWhisperer API (either 'v1' or 'v2') + input_file_path (str): Path to the input file to be processed + fs (FileStorage): File storage object to read the file + enable_highlight (bool): Whether to enable highlight (only for v1) + + Returns: + requests.Response: Response from the whisper request + """ + version = self.config['version'] + config = self.config + params = {} headers = self._get_request_headers() + if version == "v1": + params = self._get_whisper_params(enable_highlight) + elif version == "v2": + params = LLMWhispererHelper.get_whisperer_params(config) + else: + raise ValueError("Unsupported version. Only 'v1' and 'v2' are allowed.") + headers["Content-Type"] = "application/octet-stream" - params = self._get_whisper_params(enable_highlight) - response: requests.Response try: + input_file_data = fs.read(input_file_path, "rb") response = self._make_request( request_method=HTTPMethod.POST, request_endpoint=WhispererEndpoint.WHISPER, headers=headers, params=params, - data=fs.read(path=input_file_path, mode="rb"), + data=input_file_data, ) except OSError as e: logger.error(f"OS error while reading {input_file_path}: {e}") raise ExtractorError(str(e)) + return response def _extract_text_from_response( @@ -337,10 +406,12 @@ def _extract_text_from_response( fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL), ) -> str: output_json = {} + version = self.config['version'] if response.status_code == 200: output_json = response.json() elif response.status_code == 202: - whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH) + whisper_hash_key = WhisperStatus.WHISPER_HASH_V2 if version == "v2" else WhisperStatus.WHISPER_HASH + whisper_hash = response.json().get(whisper_hash_key) output_json = self._extract_async(whisper_hash=whisper_hash) else: raise ExtractorError("Couldn't extract text from file") @@ -348,7 +419,8 @@ def _extract_text_from_response( self._write_output_to_file( 
output_json=output_json, output_file_path=Path(output_file_path), fs=fs ) - return output_json.get("text", "") + output_key = "text" if version == "v1" else "result_text" + return output_json.get(output_key, "") def _write_output_to_file( self, @@ -369,7 +441,9 @@ def _write_output_to_file( ExtractorError: If there is an error while writing the output file. """ try: - text_output = output_json.get("text", "") + version = self.config['version'] + output_key = "text" if version == "v1" else "result_text" + text_output = output_json.get(output_key, "") logger.info(f"Writing output to {output_file_path}") fs.write( path=output_file_path, @@ -423,22 +497,35 @@ def process( Defaults to None. Returns: - str: Extracted text + TextExtractionResult: Extracted text along with metadata. """ + if self.config['version'] == "v2": + # V2 logic + response: requests.Response = self._send_whisper_request( + input_file_path, fs=fs + ) + response_text = response.text + response_dict = json.loads(response_text) + metadata = TextExtractionMetadata( + whisper_hash=response_dict.get(WhisperStatus.WHISPER_HASH_V2, "") + ) + else: + # V1 logic + response: requests.Response = self._send_whisper_request( + input_file_path, + fs, + bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), + ) - response: requests.Response = self._send_whisper_request( - input_file_path, - fs, - bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), - ) + metadata = TextExtractionMetadata( + whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + ) - metadata = TextExtractionMetadata( - whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + extracted_text = self._extract_text_from_response( + output_file_path, response, fs ) return TextExtractionResult( - extracted_text=self._extract_text_from_response( - output_file_path, response, fs - ), + extracted_text=extracted_text, extraction_metadata=metadata, ) diff --git 
a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index 2bccb688..d4bde9ea 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -1,13 +1,23 @@ { - "title": "LLMWhisperer v1 Text Extractor", + "title": "LLMWhisperer Text Extractor", "type": "object", "required": [ "adapter_name", "unstract_key", - "url" + "url", + "version" ], - "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.", "properties": { + "version": { + "type": "string", + "title": "Version", + "enum": [ + "v1", + "v2" + ], + "default": "v2", + "description": "Select the version of LLMWhisperer to use." + }, "adapter_name": { "type": "string", "title": "Name", @@ -18,120 +28,166 @@ "type": "string", "title": "URL", "format": "uri", - "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLMWhisperer service. Please note that this version of LLMWhisperer is deprecated." + "default": "https://llmwhisperer-api.us-central.unstract.com", + "description": "Provide the URL of the LLMWhisperer service." 
}, "unstract_key": { "type": "string", "title": "Unstract Key", "format": "password", - "description": "API key obtained from the [Unstract developer portal](https://unstract-api-resource.developer.azure-api.net)" - }, - "mode": { - "type": "string", - "title": "Mode", - "enum": [ - "native_text", - "low_cost", - "high_quality", - "form" - ], - "default": "form", - "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)" - }, - "output_mode": { - "type": "string", - "title": "Output Mode", - "enum": [ - "line-printer", - "dump-text", - "text" - ], - "default": "line-printer", - "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" - }, - - "line_splitter_tolerance": { - "type": "number", - "title": "Line Splitter Tolerance", - "default": 0.4, - "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi column layout with text in each column that is not aligned." - }, - "horizontal_stretch_factor": { - "type": "number", - "title": "Horizontal Stretch Factor", - "default": 1.0, - "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." - }, - "pages_to_extract": { - "type": "string", - "title": "Page number(s) or range to extract", - "default": "", - "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", - "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." 
- }, - "page_seperator": { - "type": "string", - "title": "Page separator", - "default": "<<< >>>", - "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + "description": "API key obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)" } }, - "if": { - "anyOf": [ - { + "allOf": [ + { + "if": { "properties": { - "mode": { - "const": "low_cost" + "version": { + "const": "v1" } } }, - { + "then": { + "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.", "properties": { "mode": { - "const": "high_quality" + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)" + }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "line-printer", + "dump-text", + "text" + ], + "default": "line-printer", + "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" + }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi-column layout with text in each column that is not aligned." 
+ }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other." + }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<< >>>", + "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + } + }, + "required": [ + "mode", + "output_mode" + ] + } + }, + { + "if": { + "properties": { + "version": { + "const": "v2" } } }, - { + "then": { "properties": { "mode": { - "const": "form" + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)." 
+ }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "layout_preserving", + "text" + ], + "default": "layout_preserving", + "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)" + }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height." + }, + "line_splitter_strategy": { + "type": "string", + "title": "Line Splitter Strategy", + "default": "left-priority", + "description": "An advanced option for customizing the line splitting process." + }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other." + }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<<", + "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator." + }, + "tag": { + "type": "string", + "title": "Tag", + "default": "default", + "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. 
This can be used for cross-referencing in usage reports." + }, + "use_webhook": { + "type": "string", + "title": "Webhook", + "default": "", + "description": "The webhook's name which should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint." + }, + "webhook_metadata": { + "type": "string", + "title": "Webhook Metadata", + "default": "", + "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint." } } } - ] - }, - "then": { - "properties": { - "median_filter_size": { - "type": "integer", - "title": "Median Filter Size", - "default": 0, - "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." - }, - "gaussian_blur_radius": { - "type": "number", - "title": "Gaussian Blur Radius", - "default": 0.0, - "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." - }, - "mark_vertical_lines": { - "type": "boolean", - "title": "Mark Vertical Lines", - "default": false, - "description": "Detect vertical lines in the document and replicate the same using text (using \"|\" symbol). Use this for displaying tables with borders." - }, - "mark_horizontal_lines": { - "type": "boolean", - "title": "Mark Horizontal Lines", - "default": false, - "description": "Detect horizontal lines in the document and replicate the same using text (using \"-\" symbol). Use this for displaying tables with borders and other horizontal serperators found in the document." - } - }, - "required": [ - "median_filter_size", - "gaussian_blur_radius" - ] - } + } + ] }