From 6947814c3fe1508b2ffccc5b9ad36d3c2bb8ce4c Mon Sep 17 00:00:00 2001
From: jagadeeswaran-zipstack <jagadeeswaran@zipstack.com>
Date: Thu, 9 Jan 2025 09:48:50 +0530
Subject: [PATCH 1/5] unified llmw adapters

---
 .../x2text/llm_whisperer/src/constants.py     |  18 +-
 .../x2text/llm_whisperer/src/helper.py        |  81 ++++++
 .../x2text/llm_whisperer/src/llm_whisperer.py | 203 ++++++++++----
 .../llm_whisperer/src/static/json_schema.json | 252 +++++++++++-------
 4 files changed, 397 insertions(+), 157 deletions(-)
 create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
index 6b11d65b..87b11a49 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
@@ -18,6 +18,7 @@ class OutputModes(Enum):
     LINE_PRINTER = "line-printer"
     DUMP_TEXT = "dump-text"
     TEXT = "text"
+    LAYOUT_PRESERVING = "layout_preserving"
 
 
 class HTTPMethod(Enum):
@@ -48,10 +49,13 @@ class WhispererEnv:
             LLMWhisperer's status API. Defaults to 30s
         MAX_POLLS: Total number of times to poll the status API.
             Set to -1 to poll indefinitely. Defaults to -1
+        STATUS_RETRIES: Number of times to retry calling LLMWhisperer's status API
+            on failure during polling. Defaults to 5.
     """
 
     POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
     MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
+    STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES"
 
 
 class WhispererConfig:
@@ -66,6 +70,7 @@ class WhispererConfig:
     GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius"
     FORCE_TEXT_PROCESSING = "force_text_processing"
     LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance"
+    LINE_SPLITTER_STRATEGY = "line_splitter_strategy"
     HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
     PAGES_TO_EXTRACT = "pages_to_extract"
     STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting"
@@ -74,7 +79,12 @@ class WhispererConfig:
     PAGE_SEPARATOR = "page_seperator"
     MARK_VERTICAL_LINES = "mark_vertical_lines"
     MARK_HORIZONTAL_LINES = "mark_horizontal_lines"
-
+    URL_IN_POST = "url_in_post"
+    TAG = "tag"
+    USE_WEBHOOK = "use_webhook"
+    WEBHOOK_METADATA = "webhook_metadata"
+    TEXT_ONLY = "text_only"
+    VERSION = "version"
 
 class WhisperStatus:
     """Values returned / used by /whisper-status endpoint."""
@@ -86,6 +96,7 @@ class WhisperStatus:
     # Used for async processing
     WHISPER_HASH = "whisper-hash"
     STATUS = "status"
+    WHISPER_HASH_V2 = "whisper_hash"
 
 
 class WhispererDefaults:
@@ -95,6 +106,7 @@ class WhispererDefaults:
     GAUSSIAN_BLUR_RADIUS = 0.0
     FORCE_TEXT_PROCESSING = False
     LINE_SPLITTER_TOLERANCE = 0.75
+    LINE_SPLITTER_STRATEGY = "left-priority"
     HORIZONTAL_STRETCH_FACTOR = 1.0
     POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
     MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
@@ -104,3 +116,7 @@ class WhispererDefaults:
     PAGE_SEPARATOR = "<<< >>>"
     MARK_VERTICAL_LINES = False
     MARK_HORIZONTAL_LINES = False
+    STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5))
+    URL_IN_POST = False
+    TAG = "default"
+    TEXT_ONLY = False
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py
new file mode 100644
index 00000000..11648674
--- /dev/null
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py
@@ -0,0 +1,81 @@
+import logging
+from typing import Any
+from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import (
+    Modes,
+    OutputModes,
+    WhispererConfig,
+    WhispererDefaults,
+)
+logger = logging.getLogger(__name__)
+
+
+class LLMWhispererHelper:
+
+    @staticmethod
+    def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]:
+        """Gets query params meant for /whisper endpoint.
+
+        The params is filled based on the configuration passed.
+
+        Returns:
+            dict[str, Any]: Query params
+        """
+        params = {
+            WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value),
+            WhispererConfig.OUTPUT_MODE: config.get(
+                WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value
+            ),
+            WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get(
+                WhispererConfig.LINE_SPLITTER_TOLERANCE,
+                WhispererDefaults.LINE_SPLITTER_TOLERANCE,
+            ),
+            WhispererConfig.LINE_SPLITTER_STRATEGY: config.get(
+                WhispererConfig.LINE_SPLITTER_STRATEGY,
+                WhispererDefaults.LINE_SPLITTER_STRATEGY,
+            ),
+            WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get(
+                WhispererConfig.HORIZONTAL_STRETCH_FACTOR,
+                WhispererDefaults.HORIZONTAL_STRETCH_FACTOR,
+            ),
+            WhispererConfig.PAGES_TO_EXTRACT: config.get(
+                WhispererConfig.PAGES_TO_EXTRACT,
+                WhispererDefaults.PAGES_TO_EXTRACT,
+            ),
+            WhispererConfig.MARK_VERTICAL_LINES: config.get(
+                WhispererConfig.MARK_VERTICAL_LINES,
+                WhispererDefaults.MARK_VERTICAL_LINES,
+            ),
+            WhispererConfig.MARK_HORIZONTAL_LINES: config.get(
+                WhispererConfig.MARK_HORIZONTAL_LINES,
+                WhispererDefaults.MARK_HORIZONTAL_LINES,
+            ),
+            WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST,
+            WhispererConfig.PAGE_SEPARATOR: config.get(
+                WhispererConfig.PAGE_SEPARATOR,
+                WhispererDefaults.PAGE_SEPARATOR,
+            ),
+            # Optional params and identifiers for audit. Only `tag` has a default;
+            # the webhook params are left unset for legacy compatibility.
+            WhispererConfig.TAG: config.get(
+                WhispererConfig.TAG,
+                WhispererDefaults.TAG,
+            ),
+            WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK),
+            WhispererConfig.WEBHOOK_METADATA: config.get(
+                WhispererConfig.WEBHOOK_METADATA
+            ),
+        }
+        if params[WhispererConfig.MODE] == Modes.LOW_COST.value:
+            params.update(
+                {
+                    WhispererConfig.MEDIAN_FILTER_SIZE: config.get(
+                        WhispererConfig.MEDIAN_FILTER_SIZE,
+                        WhispererDefaults.MEDIAN_FILTER_SIZE,
+                    ),
+                    WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get(
+                        WhispererConfig.GAUSSIAN_BLUR_RADIUS,
+                        WhispererDefaults.GAUSSIAN_BLUR_RADIUS,
+                    ),
+                }
+            )
+        return params
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
index e753bed8..64dd9661 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -26,6 +26,7 @@
     WhispererHeader,
     WhisperStatus,
 )
+from unstract.sdk.adapters.x2text.llm_whisperer.src.helper import LLMWhispererHelper
 from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
 from unstract.sdk.constants import MimeType
 from unstract.sdk.file_storage import FileStorage, FileStorageProvider
@@ -34,25 +35,43 @@
 
 
 class LLMWhisperer(X2TextAdapter):
+    _version = "v2"
     def __init__(self, settings: dict[str, Any]):
         super().__init__("LLMWhisperer")
         self.config = settings
+        self.config["version"] = settings.get(WhispererConfig.VERSION, "v2")
+        LLMWhisperer._version = settings.get(WhispererConfig.VERSION, "v2")
+        
+    V1_NAME = "LLMWhisperer"
+    V1_DESCRIPTION = "LLMWhisperer X2Text"
+    V1_ICON = "/icons/adapter-icons/LLMWhisperer.png"
+
+    V2_ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
+    V2_NAME = "LLMWhisperer V2"
+    V2_DESCRIPTION = "LLMWhisperer V2 X2Text"
+    V2_ICON = "/icons/adapter-icons/LLMWhispererV2.png"
 
     @staticmethod
     def get_id() -> str:
-        return "llmwhisperer|0a1647f0-f65f-410d-843b-3d979c78350e"
-
-    @staticmethod
-    def get_name() -> str:
-        return "LLMWhisperer"
-
-    @staticmethod
-    def get_description() -> str:
-        return "LLMWhisperer X2Text"
-
-    @staticmethod
-    def get_icon() -> str:
-        return "/icons/adapter-icons/LLMWhisperer.png"
+        return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
+
+    @classmethod
+    def get_name(cls) -> str:
+        if cls._version == "v2":
+            return cls.V2_NAME
+        return cls.V1_NAME
+
+    @classmethod
+    def get_description(cls) -> str:
+        if cls._version == "v2":
+            return cls.V2_DESCRIPTION
+        return cls.V1_DESCRIPTION
+
+    @classmethod
+    def get_icon(cls) -> str:
+        if cls._version == "v2":
+            return cls.V2_ICON
+        return cls.V1_ICON
 
     @staticmethod
     def get_json_schema() -> str:
@@ -95,24 +114,23 @@ def _make_request(
         Returns:
             Response: Response from the request
         """
-        llm_whisperer_svc_url = (
-            f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}"
-        )
+        # Determine version and set appropriate URL
+        version = self.config.get("version", "v1")
+        base_url = (f"{self.config.get(WhispererConfig.URL)}/api/v2/{request_endpoint}"
+                    if version == "v2"
+                    else f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}"
+                    )
+
         if not headers:
             headers = self._get_request_headers()
 
         try:
             response: Response
             if request_method == HTTPMethod.GET:
-                response = requests.get(
-                    url=llm_whisperer_svc_url, headers=headers, params=params
-                )
+                response = requests.get(url=base_url, headers=headers, params=params)
             elif request_method == HTTPMethod.POST:
                 response = requests.post(
-                    url=llm_whisperer_svc_url,
-                    headers=headers,
-                    params=params,
-                    data=data,
+                    url=base_url, headers=headers, params=params, data=data
                 )
             else:
                 raise ExtractorError(f"Unsupported request method: {request_method}")
@@ -120,7 +138,7 @@ def _make_request(
         except ConnectionError as e:
             logger.error(f"Adapter error: {e}")
             raise ExtractorError(
-                "Unable to connect to LLMWhisperer service, please check the URL"
+                "Unable to connect to LLMWhisperer service, please check the URL",
             )
         except Timeout as e:
             msg = "Request to LLMWhisperer has timed out"
@@ -213,61 +231,87 @@ def test_connection(self) -> bool:
         return True
 
     def _check_status_until_ready(
-        self, whisper_hash: str, headers: dict[str, Any], params: dict[str, Any]
+
+        self,
+        whisper_hash: str = "",
+        headers: dict[str, Any] = None,
+        params: dict[str, Any] = None,
     ) -> WhisperStatus:
-        """Checks the extraction status by polling.
+        """Checks the extraction status by polling for both v1 and v2.
 
         Polls the /whisper-status endpoint in fixed intervals of
         env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times
         controlled by env: ADAPTER_LLMW_MAX_POLLS.
 
         Args:
-            whisper_hash (str): Identifier for the extraction,
-                returned by LLMWhisperer
+            whisper_hash (str): Identifier for the extraction, returned by
+                LLMWhisperer. The API version ('v1' or 'v2') is not an argument;
+                it is read from self.config['version'].
             headers (dict[str, Any]): Headers to pass for the status check
             params (dict[str, Any]): Params to pass for the status check
 
         Returns:
             WhisperStatus: Status of the extraction
         """
+        version = self.config['version']
         POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL
         MAX_POLLS = WhispererDefaults.MAX_POLLS
+        STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES if version == "v2" else 0
+        status_retry_count = 0
         request_count = 0
 
-        # Check status in fixed intervals upto max poll count.
         while True:
             request_count += 1
             logger.info(
-                f"Checking status with interval: {POLL_INTERVAL}s"
-                f", request count: {request_count} [max: {MAX_POLLS}]"
+                f"Checking status{' for whisper-hash ' if version == 'v2' else ''}"
+                f"'{whisper_hash}' with interval: {POLL_INTERVAL}s, request count: "
+                f"{request_count} [max: {MAX_POLLS}]"
             )
+
+            # Make request based on version
             status_response = self._make_request(
                 request_method=HTTPMethod.GET,
                 request_endpoint=WhispererEndpoint.STATUS,
                 headers=headers,
                 params=params,
             )
+
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN)
-                logger.info(f"Whisper status for {whisper_hash}: {status}")
+                logger.info(f"Whisper status for '{whisper_hash}': {status}")
                 if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]:
                     break
             else:
-                raise ExtractorError(
-                    "Error checking LLMWhisperer status: "
-                    f"{status_response.status_code} - {status_response.text}"
-                )
+                if version == "v2" and status_retry_count >= STATUS_RETRY_THRESHOLD:
+                    raise ExtractorError(
+                        f"Error checking LLMWhisperer status for whisper-hash "
+                        f"'{whisper_hash}': {status_response.text}"
+                    )
+                elif version == "v2":
+                    status_retry_count += 1
+                    logger.warning(
+                        f"Whisper status for '{whisper_hash}' failed "
+                        f"{status_retry_count} time(s), retrying... "
+                        f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}"
+                    )
+                else:  # v1 error handling
+                    raise ExtractorError(
+                        "Error checking LLMWhisperer status: "
+                        f"{status_response.status_code} - {status_response.text}"
+                    )
 
-            # Exit with error if max poll count is reached
             if request_count >= MAX_POLLS:
                 raise ExtractorError(
-                    "Unable to extract text after attempting" f" {request_count} times"
+                    f"Unable to extract text for whisper-hash '{whisper_hash}' "
+                    f"after attempting {request_count} times"
                 )
+
             time.sleep(POLL_INTERVAL)
 
         return status
 
+
     def _extract_async(self, whisper_hash: str) -> str:
         """Makes an async extraction with LLMWhisperer.
 
@@ -280,12 +324,16 @@ def _extract_async(self, whisper_hash: str) -> str:
             str: Extracted contents from the file
         """
         logger.info(f"Extracting async for whisper hash: {whisper_hash}")
-
+        version = self.config['version']
         headers: dict[str, Any] = self._get_request_headers()
-        params = {
+        params =({
             WhisperStatus.WHISPER_HASH: whisper_hash,
             WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON,
-        }
+        } if version == 'v1'
+        else {
+            WhisperStatus.WHISPER_HASH_V2: whisper_hash,
+            WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY,
+        })
 
         # Polls in fixed intervals and checks status
         self._check_status_until_ready(
@@ -312,22 +360,43 @@ def _send_whisper_request(
         fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
         enable_highlight: bool = False,
     ) -> requests.Response:
+        """Sends a whisper request for both v1 and v2.
+
+        Args:
+            input_file_path (str): Path to the input file to be processed. The API
+                version ('v1' or 'v2') is read from self.config['version'].
+            fs (FileStorage): File storage object to read the file
+            enable_highlight (bool): Whether to enable highlight (only for v1)
+
+        Returns:
+            requests.Response: Response from the whisper request
+        """
+        version = self.config['version']
+        config = self.config
+        params = {}
         headers = self._get_request_headers()
+        if version == "v1":
+            params = self._get_whisper_params(enable_highlight)
+        elif version == "v2":
+            params = LLMWhispererHelper.get_whisperer_params(config)
+        else:
+            raise ValueError("Unsupported version. Only 'v1' and 'v2' are allowed.")
+
         headers["Content-Type"] = "application/octet-stream"
-        params = self._get_whisper_params(enable_highlight)
 
-        response: requests.Response
         try:
+            input_file_data = fs.read(input_file_path, "rb")
             response = self._make_request(
                 request_method=HTTPMethod.POST,
                 request_endpoint=WhispererEndpoint.WHISPER,
                 headers=headers,
                 params=params,
-                data=fs.read(path=input_file_path, mode="rb"),
+                data=input_file_data,
             )
         except OSError as e:
             logger.error(f"OS error while reading {input_file_path}: {e}")
             raise ExtractorError(str(e))
+
         return response
 
     def _extract_text_from_response(
@@ -337,10 +406,12 @@ def _extract_text_from_response(
         fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> str:
         output_json = {}
+        version = self.config['version']
         if response.status_code == 200:
             output_json = response.json()
         elif response.status_code == 202:
-            whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH)
+            whisper_hash_key = WhisperStatus.WHISPER_HASH_V2 if version == "v2" else WhisperStatus.WHISPER_HASH
+            whisper_hash = response.json().get(whisper_hash_key)
             output_json = self._extract_async(whisper_hash=whisper_hash)
         else:
             raise ExtractorError("Couldn't extract text from file")
@@ -348,7 +419,8 @@ def _extract_text_from_response(
             self._write_output_to_file(
                 output_json=output_json, output_file_path=Path(output_file_path), fs=fs
             )
-        return output_json.get("text", "")
+        output_key = "text" if version == "v1" else "result_text"
+        return output_json.get(output_key, "")
 
     def _write_output_to_file(
         self,
@@ -369,7 +441,9 @@ def _write_output_to_file(
             ExtractorError: If there is an error while writing the output file.
         """
         try:
-            text_output = output_json.get("text", "")
+            version = self.config['version']
+            output_key = "text" if version == "v1" else "result_text"
+            text_output = output_json.get(output_key, "")
             logger.info(f"Writing output to {output_file_path}")
             fs.write(
                 path=output_file_path,
@@ -423,22 +497,35 @@ def process(
                 Defaults to None.
 
         Returns:
-            str: Extracted text
+            TextExtractionResult: Extracted text along with metadata.
         """
+        if self.config['version'] == "v2":
+            # V2 logic
+            response: requests.Response = self._send_whisper_request(
+                input_file_path, fs=fs
+            )
+            response_text = response.text
+            response_dict = json.loads(response_text)
+            metadata = TextExtractionMetadata(
+                whisper_hash=response_dict.get(WhisperStatus.WHISPER_HASH_V2, "")
+            )
+        else:
+            # V1 logic
+            response: requests.Response = self._send_whisper_request(
+                input_file_path,
+                fs,
+                bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)),
+            )
 
-        response: requests.Response = self._send_whisper_request(
-            input_file_path,
-            fs,
-            bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)),
-        )
+            metadata = TextExtractionMetadata(
+                whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "")
+            )
 
-        metadata = TextExtractionMetadata(
-            whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "")
+        extracted_text = self._extract_text_from_response(
+            output_file_path, response, fs
         )
 
         return TextExtractionResult(
-            extracted_text=self._extract_text_from_response(
-                output_file_path, response, fs
-            ),
+            extracted_text=extracted_text,
             extraction_metadata=metadata,
         )
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
index 2bccb688..d4bde9ea 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
@@ -1,13 +1,23 @@
 {
-  "title": "LLMWhisperer v1 Text Extractor",
+  "title": "LLMWhisperer Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",
     "unstract_key",
-    "url"
+    "url",
+    "version"
   ],
-  "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.",
   "properties": {
+    "version": {
+      "type": "string",
+      "title": "Version",
+      "enum": [
+        "v1",
+        "v2"
+      ],
+      "default": "v2",
+      "description": "Select the version of LLMWhisperer to use."
+    },
     "adapter_name": {
       "type": "string",
       "title": "Name",
@@ -18,120 +28,166 @@
       "type": "string",
       "title": "URL",
       "format": "uri",
-      "default": "https://llmwhisperer-api.unstract.com",
-      "description": "Provide the URL of the LLMWhisperer service. Please note that this version of LLMWhisperer is deprecated."
+      "default": "https://llmwhisperer-api.us-central.unstract.com",
+      "description": "Provide the URL of the LLMWhisperer service."
     },
     "unstract_key": {
       "type": "string",
       "title": "Unstract Key",
       "format": "password",
-      "description": "API key obtained from the [Unstract developer portal](https://unstract-api-resource.developer.azure-api.net)"
-    },
-    "mode": {
-      "type": "string",
-      "title": "Mode",
-      "enum": [
-        "native_text",
-        "low_cost",
-        "high_quality",
-        "form"
-      ],
-      "default": "form",
-      "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)"
-    },
-    "output_mode": {
-      "type": "string",
-      "title": "Output Mode",
-      "enum": [
-        "line-printer",
-        "dump-text",
-        "text"
-      ],
-      "default": "line-printer",
-      "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
-    },
-
-    "line_splitter_tolerance": {
-      "type": "number",
-      "title": "Line Splitter Tolerance",
-      "default": 0.4,
-      "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi column layout with text in each column that is not aligned."
-    },
-    "horizontal_stretch_factor": {
-      "type": "number",
-      "title": "Horizontal Stretch Factor",
-      "default": 1.0,
-      "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other."
-    },
-    "pages_to_extract": {
-      "type": "string",
-      "title": "Page number(s) or range to extract",
-      "default": "",
-      "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
-      "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
-    },
-    "page_seperator": {
-      "type": "string",
-      "title": "Page separator",
-      "default": "<<< >>>",
-      "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator."
+      "description": "API key obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)"
     }
   },
-  "if": {
-    "anyOf": [
-      {
+  "allOf": [
+    {
+      "if": {
         "properties": {
-          "mode": {
-            "const": "low_cost"
+          "version": {
+            "const": "v1"
           }
         }
       },
-      {
+      "then": {
+        "description": "LLMWhisperer v1 is deprecated, use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.",
         "properties": {
           "mode": {
-            "const": "high_quality"
+            "type": "string",
+            "title": "Mode",
+            "enum": [
+              "native_text",
+              "low_cost",
+              "high_quality",
+              "form"
+            ],
+            "default": "form",
+            "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)"
+          },
+          "output_mode": {
+            "type": "string",
+            "title": "Output Mode",
+            "enum": [
+              "line-printer",
+              "dump-text",
+              "text"
+            ],
+            "default": "line-printer",
+            "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
+          },
+          "line_splitter_tolerance": {
+            "type": "number",
+            "title": "Line Splitter Tolerance",
+            "default": 0.4,
+            "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi-column layout with text in each column that is not aligned."
+          },
+          "horizontal_stretch_factor": {
+            "type": "number",
+            "title": "Horizontal Stretch Factor",
+            "default": 1.0,
+            "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other."
+          },
+          "pages_to_extract": {
+            "type": "string",
+            "title": "Page number(s) or range to extract",
+            "default": "",
+            "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
+            "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
+          },
+          "page_seperator": {
+            "type": "string",
+            "title": "Page separator",
+            "default": "<<< >>>",
+            "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator."
+          }
+        },
+        "required": [
+          "mode",
+          "output_mode"
+        ]
+      }
+    },
+    {
+      "if": {
+        "properties": {
+          "version": {
+            "const": "v2"
           }
         }
       },
-      {
+      "then": {
         "properties": {
           "mode": {
-            "const": "form"
+            "type": "string",
+            "title": "Mode",
+            "enum": [
+              "native_text",
+              "low_cost",
+              "high_quality",
+              "form"
+            ],
+            "default": "form",
+            "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)."
+          },
+          "output_mode": {
+            "type": "string",
+            "title": "Output Mode",
+            "enum": [
+              "layout_preserving",
+              "text"
+            ],
+            "default": "layout_preserving",
+            "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
+          },
+          "line_splitter_tolerance": {
+            "type": "number",
+            "title": "Line Splitter Tolerance",
+            "default": 0.4,
+            "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height."
+          },
+          "line_splitter_strategy": {
+            "type": "string",
+            "title": "Line Splitter Strategy",
+            "default": "left-priority",
+            "description": "An advanced option for customizing the line splitting process."
+          },
+          "horizontal_stretch_factor": {
+            "type": "number",
+            "title": "Horizontal Stretch Factor",
+            "default": 1.0,
+            "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi-column text merges with each other."
+          },
+          "pages_to_extract": {
+            "type": "string",
+            "title": "Page number(s) or range to extract",
+            "default": "",
+            "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
+            "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
+          },
+          "page_seperator": {
+            "type": "string",
+            "title": "Page separator",
+            "default": "<<<",
+            "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator."
+          },
+          "tag": {
+            "type": "string",
+            "title": "Tag",
+            "default": "default",
+            "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross-referencing in usage reports."
+          },
+          "use_webhook": {
+            "type": "string",
+            "title": "Webhook",
+            "default": "",
+            "description": "The webhook's name which should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint."
+          },
+          "webhook_metadata": {
+            "type": "string",
+            "title": "Webhook Metadata",
+            "default": "",
+            "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint."
           }
         }
       }
-    ]
-  },
-  "then": {
-    "properties": {
-      "median_filter_size": {
-        "type": "integer",
-        "title": "Median Filter Size",
-        "default": 0,
-        "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
-      },
-      "gaussian_blur_radius": {
-        "type": "number",
-        "title": "Gaussian Blur Radius",
-        "default": 0.0,
-        "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
-      },
-      "mark_vertical_lines": {
-        "type": "boolean",
-        "title": "Mark Vertical Lines",
-        "default": false,
-        "description": "Detect vertical lines in the document and replicate the same using text (using \"|\" symbol). Use this for displaying tables with borders."
-      },
-      "mark_horizontal_lines": {
-        "type": "boolean",
-        "title": "Mark Horizontal Lines",
-        "default": false,
-        "description": "Detect horizontal lines in the document and replicate the same using text (using \"-\" symbol). Use this for displaying tables with borders and other horizontal serperators found in the document."
-      }
-    },
-    "required": [
-      "median_filter_size",
-      "gaussian_blur_radius"
-    ]
-  }
+    }
+  ]
 }

From b771e943f103313fb38eccaaccb0ace2fd2f73e3 Mon Sep 17 00:00:00 2001
From: jagadeeswaran-zipstack <jagadeeswaran@zipstack.com>
Date: Mon, 13 Jan 2025 16:39:47 +0530
Subject: [PATCH 2/5] updated README and unified adapter names

---
 .../adapters/x2text/llm_whisperer/README.md   |  56 ++++++++-
 .../x2text/llm_whisperer/src/llm_whisperer.py |  43 +++----
 .../llm_whisperer/src/static/json_schema.json | 106 +++++++++++++++++-
 3 files changed, 170 insertions(+), 35 deletions(-)

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md
index 0c1a9ea1..484b5979 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/README.md
@@ -4,7 +4,55 @@
 
 The below env variables are resolved by LLMWhisperer adapter
 
-| Variable                     | Description                                                                                  |
-| ---------------------------- | -------------------------------------------------------------------------------------------- |
-| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s            |
-| `ADAPTER_LLMW_MAX_POLLS`     | Total number of times to poll the status API. Defaults to 30                                 |
+| Variable                     | Description                                                                       |
+| ---------------------------- | --------------------------------------------------------------------------------- |
+| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s |
+| `ADAPTER_LLMW_MAX_POLLS`     | Total number of times to poll the status API. Defaults to 30                      |
+
+---
+
+## id: llm_whisperer_apis_changelog
+
+# Changelog
+
+## Version 2.0.0
+
+:::warning
+This version of the API is not backward compatible with the previous version.
+:::
+
+### API endpoint
+
+- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2`
+
+### Global change in parameter naming
+
+- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency.
+
+### Whisper parameters
+
+#### Added
+
+- `mode` (str, optional): The processing mode.
+- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document.
+- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document.
+- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process.
+- `lang` (str, optional): The language of the document.
+- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes.
+- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes.
+- `use_webhook` (str, optional): The name of the webhook to call after the document is processed.
+- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed.
+
+#### Removed
+
+- `timeout` (int, optional): The timeout for API requests. _There is no sync mode now. All requests are async._
+- `force_text_processing` (bool, optional): Whether to force text processing. _This feature is removed_
+- `ocr_provider` (str, optional): The OCR provider to use. _This is superseded by `mode`_
+- `processing_mode` (str, optional): The processing mode. _This is superseded by `mode`_
+- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. _Feature is removed. Data is still available and sent back when retrieve is called_
+
+### New features
+
+#### Webhooks
+
+- Added support for webhooks. You can now register a webhook and use it to receive the processed document.
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
index 64dd9661..7a230d17 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -42,36 +42,27 @@ def __init__(self, settings: dict[str, Any]):
         self.config["version"] = settings.get(WhispererConfig.VERSION, "v2")
         LLMWhisperer._version = settings.get(WhispererConfig.VERSION, "v2")
         
-    V1_NAME = "LLMWhisperer"
-    V1_DESCRIPTION = "LLMWhisperer X2Text"
-    V1_ICON = "/icons/adapter-icons/LLMWhisperer.png"
 
-    V2_ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
-    V2_NAME = "LLMWhisperer V2"
-    V2_DESCRIPTION = "LLMWhisperer V2 X2Text"
-    V2_ICON = "/icons/adapter-icons/LLMWhispererV2.png"
+    ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
+    NAME = "LLMWhisperer V2"
+    DESCRIPTION = "LLMWhisperer V2 X2Text"
+    ICON = "/icons/adapter-icons/LLMWhispererV2.png"
 
     @staticmethod
     def get_id() -> str:
-        return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
-
-    @classmethod
-    def get_name(cls) -> str:
-        if cls._version == "v2":
-            return cls.V2_NAME
-        return cls.V1_NAME
-
-    @classmethod
-    def get_description(cls) -> str:
-        if cls._version == "v2":
-            return cls.V2_DESCRIPTION
-        return cls.V1_DESCRIPTION
-
-    @classmethod
-    def get_icon(cls) -> str:
-        if cls._version == "v2":
-            return cls.V2_ICON
-        return cls.V1_ICON
+        return LLMWhisperer.ID
+
+    @staticmethod
+    def get_name() -> str:
+        return LLMWhisperer.NAME
+
+    @staticmethod
+    def get_description() -> str:
+        return LLMWhisperer.DESCRIPTION
+
+    @staticmethod
+    def get_icon() -> str:
+        return LLMWhisperer.ICON
 
     @staticmethod
     def get_json_schema() -> str:
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
index d4bde9ea..d0316d59 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
@@ -99,10 +99,63 @@
             "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator."
           }
         },
-        "required": [
-          "mode",
-          "output_mode"
-        ]
+        "if": {
+          "anyOf": [
+            {
+              "properties": {
+                "mode": {
+                  "const": "low_cost"
+                }
+              }
+            },
+            {
+              "properties": {
+                "mode": {
+                  "const": "high_quality"
+                }
+              }
+            },
+            {
+              "properties": {
+                "mode": {
+                  "const": "form"
+                }
+              }
+            }
+          ]
+        },
+        "then": {
+          "properties": {
+            "median_filter_size": {
+              "type": "integer",
+              "title": "Median Filter Size",
+              "default": 0,
+              "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
+            },
+            "gaussian_blur_radius": {
+              "type": "number",
+              "title": "Gaussian Blur Radius",
+              "default": 0.0,
+              "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
+            },
+            "mark_vertical_lines": {
+              "type": "boolean",
+              "title": "Mark Vertical Lines",
+              "default": false,
+              "description": "Detect vertical lines in the document and replicate the same using text (using \"|\" symbol). Use this for displaying tables with borders."
+            },
+            "mark_horizontal_lines": {
+              "type": "boolean",
+              "title": "Mark Horizontal Lines",
+              "default": false,
+              "description": "Detect horizontal lines in the document and replicate the same using text (using \"-\" symbol). Use this for displaying tables with borders and other horizontal separators found in the document."
+            }
+          },
+          "required": [
+            "median_filter_size",
+            "gaussian_blur_radius"
+          ]
+        }
       }
     },
     {
@@ -168,6 +221,18 @@
             "default": "<<<",
             "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator."
           },
+          "mark_vertical_lines": {
+            "type": "boolean",
+            "title": "Mark vertical lines",
+            "default": false,
+            "description": "States whether to reproduce vertical lines in the document."
+          },
+          "mark_horizontal_lines": {
+            "type": "boolean",
+            "title": "Mark horizontal lines",
+            "default": false,
+            "description": "States whether to reproduce horizontal lines in the document."
+          },
           "tag": {
             "type": "string",
             "title": "Tag",
@@ -186,8 +251,39 @@
             "default": "",
             "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint."
           }
+        },
+        "if": {
+          "anyOf": [
+            {
+              "properties": {
+                "mode": {
+                  "const": "low_cost"
+                }
+              }
+            }
+          ]
+        },
+        "then": {
+          "properties": {
+            "median_filter_size": {
+              "type": "integer",
+              "title": "Median Filter Size",
+              "default": 0,
+              "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
+            },
+            "gaussian_blur_radius": {
+              "type": "number",
+              "title": "Gaussian Blur Radius",
+              "default": 0.0,
+              "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
+            }
+          },
+          "required": [
+            "median_filter_size",
+            "gaussian_blur_radius"
+          ]
         }
       }
     }
   ]
-}
+}
\ No newline at end of file

From 5243ab8e7151c922762941c447092fb73dab30b6 Mon Sep 17 00:00:00 2001
From: jagadeeswaran-zipstack <jagadeeswaran@zipstack.com>
Date: Mon, 13 Jan 2025 16:50:57 +0530
Subject: [PATCH 3/5] env separation for v1 and v2

---
 .../x2text/llm_whisperer/src/__init__.py      |   2 +-
 .../x2text/llm_whisperer/src/constants.py     |   4 +
 .../x2text/llm_whisperer/src/llm_whisperer.py |   4 +-
 .../x2text/llm_whisperer_v2/README.md         |  58 ---
 .../x2text/llm_whisperer_v2/pyproject.toml    |  25 --
 .../x2text/llm_whisperer_v2/src/__init__.py   |   9 -
 .../x2text/llm_whisperer_v2/src/constants.py  | 107 -----
 .../x2text/llm_whisperer_v2/src/helper.py     | 388 ------------------
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  |  93 -----
 .../src/static/json_schema.json               | 144 -------
 10 files changed, 7 insertions(+), 827 deletions(-)
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
 delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
index ba216498..c4f02191 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
@@ -2,7 +2,7 @@
 
 metadata = {
     "name": LLMWhisperer.__name__,
-    "version": "1.0.0",
+    "version": "2.0.0",
     "adapter": LLMWhisperer,
     "description": "LLMWhisperer X2Text adapter",
     "is_active": True,
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
index 87b11a49..d0c60286 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
@@ -55,6 +55,8 @@ class WhispererEnv:
 
     POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
     MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
+    POLL_INTERVAL_V2 = "ADAPTER_LLMW_POLL_INTERVAL_V2"
+    MAX_POLLS_V2 = "ADAPTER_LLMW_MAX_POLLS_V2"
     STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES"
 
 
@@ -110,6 +112,8 @@ class WhispererDefaults:
     HORIZONTAL_STRETCH_FACTOR = 1.0
     POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
     MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
+    POLL_INTERVAL_V2 = int(os.getenv(WhispererEnv.POLL_INTERVAL_V2, 30))
+    MAX_POLLS_V2 = int(os.getenv(WhispererEnv.MAX_POLLS_V2, 30))
     PAGES_TO_EXTRACT = ""
     ADD_LINE_NOS = True
     OUTPUT_JSON = True
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
index 7a230d17..59b61a1d 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -245,8 +245,8 @@ def _check_status_until_ready(
             WhisperStatus: Status of the extraction
         """
         version = self.config['version']
-        POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL
-        MAX_POLLS = WhispererDefaults.MAX_POLLS
+        POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL_V2 if version == "v2" else WhispererDefaults.POLL_INTERVAL
+        MAX_POLLS = WhispererDefaults.MAX_POLLS_V2 if version == "v2" else WhispererDefaults.MAX_POLLS
         STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES if version == "v2" else 0
         status_retry_count = 0
         request_count = 0
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md
deleted file mode 100644
index f33810b3..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Unstract LLMWWhisperer v2 X2Text Adapter
-
-## Env variables
-
-The below env variables are resolved by LLMWhisperer adapter
-
-| Variable                     | Description                                                                                  |
-| ---------------------------- | -------------------------------------------------------------------------------------------- |
-| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s            |
-| `ADAPTER_LLMW_MAX_POLLS`     | Total number of times to poll the status API. Defaults to 30                                 |
-
-
----
-id: llm_whisperer_apis_changelog
----
-
-# Changelog
-
-## Version 2.0.0
-
-:::warning
-This version of the API is not backward compatible with the previous version.
-:::
-
-### API endpoint
-
-- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2`
-
-### Global change in parameter naming
-
-- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency. 
-
-### Whisper parameters
-
-#### Added
-- `mode` (str, optional): The processing mode. 
-- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document.
-- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document. 
-- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process. 
-- `lang` (str, optional): The language of the document. 
-- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes.
-- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes.
-- `use_webhook` (str, optional): The name of the webhook to call after the document is processed.
-- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed.
-
-#### Removed
-- `timeout` (int, optional): The timeout for API requests. *There is no sync mode now. All requests are async.*
-- `force_text_processing` (bool, optional): Whether to force text processing. *This is feature is removed*
-- `ocr_provider` (str, optional): The OCR provider to use. *This is superseded by `mode`*
-- `processing_mode` (str, optional): The processing mode. *This is superseded by `mode`*
-- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. *Feature is removed. Data still available and set back when retrieve is called*
-
-
-### New features
-
-#### Webhooks
-
-- Added support for webhooks. You can now register a webhook and use it to receive the processed document.
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml
deleted file mode 100644
index bf7ad3a4..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml
+++ /dev/null
@@ -1,25 +0,0 @@
-[build-system]
-requires = ["pdm-backend"]
-build-backend = "pdm.backend"
-
-
-[project]
-name = "unstract-llm_whisperer-x2text-v2"
-version = "0.0.1"
-description = "V2 of LLMWhisperer X2Text Adapter"
-authors = [
-    {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
-]
-dependencies = [
-]
-requires-python = ">=3.9"
-readme = "README.md"
-classifiers = [
-  "Programming Language :: Python"
-]
-license = {text = "MIT"}
-
-[tool.pdm.build]
-includes = ["src"]
-package-dir = "src"
-# source-includes = ["tests"]
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py
deleted file mode 100644
index 14240c6a..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from .llm_whisperer_v2 import LLMWhispererV2
-
-metadata = {
-    "name": LLMWhispererV2.__name__,
-    "version": "1.0.0",
-    "adapter": LLMWhispererV2,
-    "description": "LLMWhispererV2 X2Text adapter",
-    "is_active": True,
-}
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py
deleted file mode 100644
index 7e2d7dcf..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import os
-from enum import Enum
-
-
-class Modes(Enum):
-    NATIVE_TEXT = "native_text"
-    LOW_COST = "low_cost"
-    HIGH_QUALITY = "high_quality"
-    FORM = "form"
-
-
-class OutputModes(Enum):
-    LAYOUT_PRESERVING = "layout_preserving"
-    TEXT = "text"
-
-
-class HTTPMethod(Enum):
-    GET = "GET"
-    POST = "POST"
-
-
-class WhispererHeader:
-    UNSTRACT_KEY = "unstract-key"
-
-
-class WhispererEndpoint:
-    """Endpoints available at LLMWhisperer service."""
-
-    TEST_CONNECTION = "test-connection"
-    WHISPER = "whisper"
-    STATUS = "whisper-status"
-    RETRIEVE = "whisper-retrieve"
-
-
-class WhispererEnv:
-    """Env variables for LLMWhisperer.
-
-    Can be used to alter behaviour at runtime.
-
-    Attributes:
-        POLL_INTERVAL: Time in seconds to wait before polling
-            LLMWhisperer's status API. Defaults to 30s
-        MAX_POLLS: Total number of times to poll the status API.
-            Set to -1 to poll indefinitely. Defaults to -1
-        STATUS_RETRIES: Number of times to retry calling LLLMWhisperer's status API
-            on failure during polling. Defaults to 5.
-    """
-
-    POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
-    MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
-    STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES"
-
-
-class WhispererConfig:
-    """Dictionary keys used to configure LLMWhisperer service."""
-
-    URL = "url"
-    MODE = "mode"
-    OUTPUT_MODE = "output_mode"
-    UNSTRACT_KEY = "unstract_key"
-    MEDIAN_FILTER_SIZE = "median_filter_size"
-    GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius"
-    LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance"
-    LINE_SPLITTER_STRATEGY = "line_splitter_strategy"
-    HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
-    PAGES_TO_EXTRACT = "pages_to_extract"
-    MARK_VERTICAL_LINES = "mark_vertical_lines"
-    MARK_HORIZONTAL_LINES = "mark_horizontal_lines"
-    PAGE_SEPARATOR = "page_seperator"
-    URL_IN_POST = "url_in_post"
-    TAG = "tag"
-    USE_WEBHOOK = "use_webhook"
-    WEBHOOK_METADATA = "webhook_metadata"
-    TEXT_ONLY = "text_only"
-
-
-class WhisperStatus:
-    """Values returned / used by /whisper-status endpoint."""
-
-    PROCESSING = "processing"
-    PROCESSED = "processed"
-    DELIVERED = "delivered"
-    UNKNOWN = "unknown"
-    # Used for async processing
-    WHISPER_HASH = "whisper_hash"
-    STATUS = "status"
-
-
-class WhispererDefaults:
-    """Defaults meant for LLMWhisperer."""
-
-    MEDIAN_FILTER_SIZE = 0
-    GAUSSIAN_BLUR_RADIUS = 0.0
-    FORCE_TEXT_PROCESSING = False
-    LINE_SPLITTER_TOLERANCE = 0.75
-    LINE_SPLITTER_STRATEGY = "left-priority"
-    HORIZONTAL_STRETCH_FACTOR = 1.0
-    POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
-    MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
-    STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5))
-    PAGES_TO_EXTRACT = ""
-    PAGE_SEPARATOR = "<<<"
-    MARK_VERTICAL_LINES = False
-    MARK_HORIZONTAL_LINES = False
-    URL_IN_POST = False
-    TAG = "default"
-    TEXT_ONLY = False
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
deleted file mode 100644
index dfea856c..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
+++ /dev/null
@@ -1,388 +0,0 @@
-import json
-import logging
-import time
-from pathlib import Path
-from typing import Any, Optional
-
-import requests
-from requests import Response
-from requests.exceptions import ConnectionError, HTTPError, Timeout
-
-from unstract.sdk.adapters.exceptions import ExtractorError
-from unstract.sdk.adapters.utils import AdapterUtils
-from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import (
-    HTTPMethod,
-    Modes,
-    OutputModes,
-    WhispererConfig,
-    WhispererDefaults,
-    WhispererEndpoint,
-    WhispererHeader,
-    WhisperStatus,
-)
-from unstract.sdk.constants import MimeType
-from unstract.sdk.file_storage import FileStorage, FileStorageProvider
-
-logger = logging.getLogger(__name__)
-
-
-class LLMWhispererHelper:
-    @staticmethod
-    def get_request_headers(config: dict[str, Any]) -> dict[str, Any]:
-        """Obtains the request headers to authenticate with LLMWhisperer.
-
-        Returns:
-            str: Request headers
-        """
-        return {
-            "accept": MimeType.JSON,
-            WhispererHeader.UNSTRACT_KEY: config.get(WhispererConfig.UNSTRACT_KEY),
-        }
-
-    @staticmethod
-    def make_request(
-        config: dict[str, Any],
-        request_method: HTTPMethod,
-        request_endpoint: str,
-        headers: Optional[dict[str, Any]] = None,
-        params: Optional[dict[str, Any]] = None,
-        data: Optional[Any] = None,
-    ) -> Response:
-        """Makes a request to LLMWhisperer service.
-
-        Args:
-            request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST
-            request_endpoint (str): LLMWhisperer endpoint to hit
-            headers (Optional[dict[str, Any]], optional): Headers to pass.
-                Defaults to None.
-            params (Optional[dict[str, Any]], optional): Query params to pass.
-                Defaults to None.
-            data (Optional[Any], optional): Data to pass in case of POST.
-                Defaults to None.
-
-        Returns:
-            Response: Response from the request
-        """
-        llm_whisperer_svc_url = (
-            f"{config.get(WhispererConfig.URL)}" f"/api/v2/{request_endpoint}"
-        )
-        if not headers:
-            headers = LLMWhispererHelper.get_request_headers(config=config)
-
-        try:
-            response: Response
-            if request_method == HTTPMethod.GET:
-                response = requests.get(
-                    url=llm_whisperer_svc_url, headers=headers, params=params
-                )
-            elif request_method == HTTPMethod.POST:
-                response = requests.post(
-                    url=llm_whisperer_svc_url,
-                    headers=headers,
-                    params=params,
-                    data=data,
-                )
-            else:
-                raise ExtractorError(
-                    f"Unsupported request method: {request_method}", status_code=500
-                )
-            response.raise_for_status()
-        except ConnectionError as e:
-            logger.error(f"Adapter error: {e}")
-            raise ExtractorError(
-                "Unable to connect to LLMWhisperer service, please check the URL",
-                actual_err=e,
-                status_code=503,
-            )
-        except Timeout as e:
-            msg = "Request to LLMWhisperer has timed out"
-            logger.error(f"{msg}: {e}")
-            raise ExtractorError(msg, actual_err=e, status_code=504)
-        except HTTPError as e:
-            logger.error(f"Adapter error: {e}")
-            default_err = "Error while calling the LLMWhisperer service"
-            msg = AdapterUtils.get_msg_from_request_exc(
-                err=e, message_key="message", default_err=default_err
-            )
-            raise ExtractorError(msg)
-        return response
-
-    @staticmethod
-    def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]:
-        """Gets query params meant for /whisper endpoint.
-
-        The params is filled based on the configuration passed.
-
-        Returns:
-            dict[str, Any]: Query params
-        """
-        params = {
-            WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value),
-            WhispererConfig.OUTPUT_MODE: config.get(
-                WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value
-            ),
-            WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get(
-                WhispererConfig.LINE_SPLITTER_TOLERANCE,
-                WhispererDefaults.LINE_SPLITTER_TOLERANCE,
-            ),
-            WhispererConfig.LINE_SPLITTER_STRATEGY: config.get(
-                WhispererConfig.LINE_SPLITTER_STRATEGY,
-                WhispererDefaults.LINE_SPLITTER_STRATEGY,
-            ),
-            WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get(
-                WhispererConfig.HORIZONTAL_STRETCH_FACTOR,
-                WhispererDefaults.HORIZONTAL_STRETCH_FACTOR,
-            ),
-            WhispererConfig.PAGES_TO_EXTRACT: config.get(
-                WhispererConfig.PAGES_TO_EXTRACT,
-                WhispererDefaults.PAGES_TO_EXTRACT,
-            ),
-            WhispererConfig.MARK_VERTICAL_LINES: config.get(
-                WhispererConfig.MARK_VERTICAL_LINES,
-                WhispererDefaults.MARK_VERTICAL_LINES,
-            ),
-            WhispererConfig.MARK_HORIZONTAL_LINES: config.get(
-                WhispererConfig.MARK_HORIZONTAL_LINES,
-                WhispererDefaults.MARK_HORIZONTAL_LINES,
-            ),
-            WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST,
-            WhispererConfig.PAGE_SEPARATOR: config.get(
-                WhispererConfig.PAGE_SEPARATOR,
-                WhispererDefaults.PAGE_SEPARATOR,
-            ),
-            # Not providing default value to maintain legacy compatablity
-            # these are optional params and identifiers for audit
-            WhispererConfig.TAG: config.get(
-                WhispererConfig.TAG,
-                WhispererDefaults.TAG,
-            ),
-            WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK),
-            WhispererConfig.WEBHOOK_METADATA: config.get(
-                WhispererConfig.WEBHOOK_METADATA
-            ),
-        }
-        if params[WhispererConfig.MODE] == Modes.LOW_COST.value:
-            params.update(
-                {
-                    WhispererConfig.MEDIAN_FILTER_SIZE: config.get(
-                        WhispererConfig.MEDIAN_FILTER_SIZE,
-                        WhispererDefaults.MEDIAN_FILTER_SIZE,
-                    ),
-                    WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get(
-                        WhispererConfig.GAUSSIAN_BLUR_RADIUS,
-                        WhispererDefaults.GAUSSIAN_BLUR_RADIUS,
-                    ),
-                }
-            )
-        return params
-
-    @staticmethod
-    def check_status_until_ready(
-        config: dict[str, Any],
-        whisper_hash: str,
-        headers: dict[str, Any],
-        params: dict[str, Any],
-    ) -> WhisperStatus:
-        """Checks the extraction status by polling.
-
-        Polls the /whisper-status endpoint in fixed intervals of
-        env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times
-        controlled by env: ADAPTER_LLMW_MAX_POLLS.
-
-        Args:
-            whisper_hash (str): Identifier for the extraction,
-                returned by LLMWhisperer
-            headers (dict[str, Any]): Headers to pass for the status check
-            params (dict[str, Any]): Params to pass for the status check
-
-        Returns:
-            WhisperStatus: Status of the extraction
-        """
-        POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL
-        MAX_POLLS = WhispererDefaults.MAX_POLLS
-        STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES
-        status_retry_count = 0
-        request_count = 0
-
-        # Check status in fixed intervals upto max poll count.
-        while True:
-            request_count += 1
-            logger.info(
-                f"Checking status for whisper-hash '{whisper_hash}' with interval: "
-                f"{POLL_INTERVAL}s, request count: {request_count} [max: {MAX_POLLS}]"
-            )
-            status_response = LLMWhispererHelper.make_request(
-                config=config,
-                request_method=HTTPMethod.GET,
-                request_endpoint=WhispererEndpoint.STATUS,
-                headers=headers,
-                params=params,
-            )
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN)
-                logger.info(f"Whisper status for '{whisper_hash}': {status}")
-                if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]:
-                    break
-            else:
-                if status_retry_count >= STATUS_RETRY_THRESHOLD:
-                    raise ExtractorError(
-                        f"Error checking LLMWhisperer status for whisper-hash "
-                        f"'{whisper_hash}': {status_response.text}"
-                    )
-                else:
-                    status_retry_count += 1
-                    logger.warning(
-                        f"Whisper status for '{whisper_hash}' failed "
-                        f"{status_retry_count} time(s), retrying... "
-                        f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}"
-                    )
-
-            # Exit with error if max poll count is reached
-            if request_count >= MAX_POLLS:
-                raise ExtractorError(
-                    f"Unable to extract text for whisper-hash '{whisper_hash}' "
-                    f"after attempting {request_count} times"
-                )
-            time.sleep(POLL_INTERVAL)
-
-        return status
-
-    @staticmethod
-    def extract_async(config: dict[str, Any], whisper_hash: str) -> dict[Any, Any]:
-        """Makes an async extraction with LLMWhisperer.
-
-        Polls and checks the status first before proceeding to retrieve once.
-
-        Args:
-            whisper_hash (str): Identifier of the extraction
-
-        Returns:
-            str: Extracted contents from the file
-        """
-        logger.info(f"Extracting async for whisper hash: {whisper_hash}")
-
-        headers: dict[str, Any] = LLMWhispererHelper.get_request_headers(config)
-        params = {
-            WhisperStatus.WHISPER_HASH: whisper_hash,
-            WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY,
-        }
-
-        # Polls in fixed intervals and checks status
-        LLMWhispererHelper.check_status_until_ready(
-            config=config, whisper_hash=whisper_hash, headers=headers, params=params
-        )
-
-        retrieve_response = LLMWhispererHelper.make_request(
-            config=config,
-            request_method=HTTPMethod.GET,
-            request_endpoint=WhispererEndpoint.RETRIEVE,
-            headers=headers,
-            params=params,
-        )
-        if retrieve_response.status_code == 200:
-            return retrieve_response.json()
-        else:
-            raise ExtractorError(
-                "Error retrieving from LLMWhisperer: "
-                f"{retrieve_response.status_code} - {retrieve_response.text}"
-            )
-
-    @staticmethod
-    def send_whisper_request(
-        input_file_path: str,
-        config: dict[str, Any],
-        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
-    ) -> requests.Response:
-        headers = LLMWhispererHelper.get_request_headers(config)
-        headers["Content-Type"] = "application/octet-stream"
-        params = LLMWhispererHelper.get_whisperer_params(config)
-
-        response: requests.Response
-        try:
-            input_file_data = fs.read(input_file_path, "rb")
-            response = LLMWhispererHelper.make_request(
-                config=config,
-                request_method=HTTPMethod.POST,
-                request_endpoint=WhispererEndpoint.WHISPER,
-                headers=headers,
-                params=params,
-                data=input_file_data,
-            )
-        except OSError as e:
-            logger.error(f"OS error while reading {input_file_path}: {e}")
-            raise ExtractorError(str(e))
-        return response
-
-    @staticmethod
-    def extract_text_from_response(
-        config: dict[str, Any],
-        output_file_path: Optional[str],
-        response_dict: dict[str, Any],
-        response: Response,
-        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
-    ) -> str:
-        output_json = {}
-        if response.status_code == 200:
-            output_json = response.json()
-        elif response.status_code == 202:
-            whisper_hash = response_dict.get(WhisperStatus.WHISPER_HASH)
-            output_json = LLMWhispererHelper.extract_async(
-                config=config, whisper_hash=whisper_hash
-            )
-        else:
-            raise ExtractorError("Couldn't extract text from file")
-        if output_file_path:
-            LLMWhispererHelper.write_output_to_file(
-                output_json=output_json,
-                output_file_path=Path(output_file_path),
-                fs=fs,
-            )
-        return output_json.get("result_text", "")
-
-    @staticmethod
-    def write_output_to_file(
-        output_json: dict,
-        output_file_path: Path,
-        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
-    ) -> None:
-        """Writes the extracted text and metadata to the specified output file
-        and metadata file.
-
-        Args:
-            output_json (dict): The dictionary containing the extracted data,
-                with "text" as the key for the main content.
-            output_file_path (Path): The file path where the extracted text
-                should be written.
-
-        Raises:
-            ExtractorError: If there is an error while writing the output file.
-        """
-        try:
-            text_output = output_json.get("result_text", "")
-            logger.info(f"Writing output to {output_file_path}")
-            fs.write(
-                path=output_file_path, mode="w", data=text_output, encoding="utf-8"
-            )
-        except Exception as e:
-            logger.error(f"Error while writing {output_file_path}: {e}")
-            raise ExtractorError(str(e))
-        try:
-            # Define the directory of the output file and metadata paths
-            output_dir = output_file_path.parent
-            metadata_dir = output_dir / "metadata"
-            metadata_file_name = output_file_path.with_suffix(".json").name
-            metadata_file_path = metadata_dir / metadata_file_name
-            # Ensure the metadata directory exists
-            fs.mkdir(create_parents=True, path=metadata_dir)
-            # Remove the "result_text" key from the metadata
-            metadata = {
-                key: value for key, value in output_json.items() if key != "result_text"
-            }
-            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4)
-            logger.info(f"Writing metadata to {metadata_file_path}")
-            fs.write(
-                path=metadata_file_path, mode="w", data=metadata_json, encoding="utf-8"
-            )
-        except Exception as e:
-            logger.warn(f"Error while writing metadata to {metadata_file_path}: {e}")
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
deleted file mode 100644
index 94d6b246..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import json
-import logging
-import os
-from typing import Any, Optional
-
-import requests
-
-from unstract.sdk.adapters.x2text.constants import X2TextConstants
-from unstract.sdk.adapters.x2text.dto import (
-    TextExtractionMetadata,
-    TextExtractionResult,
-)
-from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import (
-    HTTPMethod,
-    WhispererEndpoint,
-)
-from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper
-from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
-from unstract.sdk.file_storage import FileStorage, FileStorageProvider
-
-logger = logging.getLogger(__name__)
-
-
-class LLMWhispererV2(X2TextAdapter):
-    def __init__(self, settings: dict[str, Any]):
-        super().__init__("LLMWhispererV2")
-        self.config = settings
-
-    @staticmethod
-    def get_id() -> str:
-        return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
-
-    @staticmethod
-    def get_name() -> str:
-        return "LLMWhisperer V2"
-
-    @staticmethod
-    def get_description() -> str:
-        return "LLMWhisperer V2 X2Text"
-
-    @staticmethod
-    def get_icon() -> str:
-        return "/icons/adapter-icons/LLMWhispererV2.png"
-
-    @staticmethod
-    def get_json_schema() -> str:
-        f = open(f"{os.path.dirname(__file__)}/static/json_schema.json")
-        schema = f.read()
-        f.close()
-        return schema
-
-    def test_connection(self) -> bool:
-        LLMWhispererHelper.make_request(
-            config=self.config,
-            request_method=HTTPMethod.GET,
-            request_endpoint=WhispererEndpoint.TEST_CONNECTION,
-        )
-        return True
-
-    def process(
-        self,
-        input_file_path: str,
-        output_file_path: Optional[str] = None,
-        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
-        **kwargs: dict[Any, Any],
-    ) -> TextExtractionResult:
-        """Used to extract text from documents.
-
-        Args:
-            input_file_path (str): Path to file that needs to be extracted
-            output_file_path (Optional[str], optional): File path to write
-                extracted text into, if None doesn't write to a file.
-                Defaults to None.
-
-        Returns:
-            str: Extracted text
-        """
-
-        response: requests.Response = LLMWhispererHelper.send_whisper_request(
-            input_file_path, self.config, fs=fs
-        )
-        response_text = response.text
-        reponse_dict = json.loads(response_text)
-        metadata = TextExtractionMetadata(
-            whisper_hash=reponse_dict.get(X2TextConstants.WHISPER_HASH_V2, "")
-        )
-
-        return TextExtractionResult(
-            extracted_text=LLMWhispererHelper.extract_text_from_response(
-                self.config, output_file_path, reponse_dict, response, fs=fs
-            ),
-            extraction_metadata=metadata,
-        )
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
deleted file mode 100644
index b2a62c4b..00000000
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
+++ /dev/null
@@ -1,144 +0,0 @@
-{
-  "title": "LLMWhisperer v2 Text Extractor",
-  "type": "object",
-  "required": [
-    "adapter_name",
-    "unstract_key",
-    "url"
-  ],
-  "properties": {
-    "adapter_name": {
-      "type": "string",
-      "title": "Name",
-      "default": "llm-whisperer-v2",
-      "description": "Provide a unique name for this adapter instance. Example: LLMWhisperer 1"
-    },
-    "url": {
-      "type": "string",
-      "title": "URL",
-      "format": "uri",
-      "default": "https://llmwhisperer-api.us-central.unstract.com",
-      "description": "Provide the base URL of the LLMWhisperer service based on your region, can be obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)."
-    },
-    "unstract_key": {
-      "type": "string",
-      "title": "Unstract Key",
-      "format": "password",
-      "description": "API key obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)"
-    },
-    "mode": {
-      "type": "string",
-      "title": "Mode",
-      "enum": [
-        "native_text",
-        "low_cost",
-        "high_quality",
-        "form"
-      ],
-      "default": "form",
-      "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)."
-    },
-    "output_mode": {
-      "type": "string",
-      "title": "Output Mode",
-      "enum": [
-        "layout_preserving",
-        "text"
-      ],
-      "default": "layout_preserving",
-      "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
-    },
-    "line_splitter_tolerance": {
-      "type": "number",
-      "title": "Line Splitter Tolerance",
-      "default": 0.4,
-      "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height"
-    },
-    "line_splitter_strategy": {
-      "type": "string",
-      "title": "Line Splitter Strategy",
-      "default":"left-priority",
-      "description": "An advanced option for customizing the line splitting process."
-    },
-    "horizontal_stretch_factor": {
-      "type": "number",
-      "title": "Horizontal Stretch Factor",
-      "default": 1.0,
-      "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other."
-    },
-    "pages_to_extract": {
-      "type": "string",
-      "title": "Page number(s) or range to extract",
-      "default": "",
-      "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
-      "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
-    },
-    "page_seperator": {
-      "type": "string",
-      "title": "Page separator",
-      "default": "<<<",
-      "description": "Specify a pattern to separate the pages in the document. This pattern will be inserted at the end of every page (e.g., `<<< {{page_no}} >>>`, `<<< >>>`). Omit `{{page_no}}` if you don't want to include the page number in the separator."
-    },
-    "mark_vertical_lines": {
-      "type": "boolean",
-      "title": "Mark vertical lines",
-      "default": false,
-      "description": "States whether to reproduce vertical lines in the document."
-    },
-    "mark_horizontal_lines": {
-      "type": "boolean",
-      "title": "Mark horizontal lines",
-      "default": false,
-      "description": "States whether to reproduce horizontal lines in the document."
-    },
-    "tag": {
-      "type": "string",
-      "title": "Tag",
-      "default": "default",
-      "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports."
-    },
-    "use_webhook": {
-      "type": "string",
-      "title": "Webhook",
-      "default": "",
-      "description": "The webhook's name which will should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint"
-    },
-    "webhook_metadata": {
-      "type": "string",
-      "title": "Webhook Metadata",
-      "default": "",
-      "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint."
-    }
-  },
-  "if": {
-    "anyOf": [
-      {
-        "properties": {
-          "mode": {
-            "const": "low_cost"
-          }
-        }
-      }
-    ]
-  },
-  "then": {
-    "properties": {
-      "median_filter_size": {
-        "type": "integer",
-        "title": "Median Filter Size",
-        "default": 0,
-        "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
-      },
-      "gaussian_blur_radius": {
-        "type": "number",
-        "title": "Gaussian Blur Radius",
-        "default": 0.0,
-        "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
-      }
-    },
-    "required": [
-      "median_filter_size",
-      "gaussian_blur_radius"
-    ]
-  }
-}

From 4f235a5a9a109e79a1f72259c63662cc50ce3714 Mon Sep 17 00:00:00 2001
From: jagadeeswaran-zipstack <jagadeeswaran@zipstack.com>
Date: Mon, 13 Jan 2025 16:58:50 +0530
Subject: [PATCH 4/5] reverting version updated in init file

---
 src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
index c4f02191..ba216498 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/__init__.py
@@ -2,7 +2,7 @@
 
 metadata = {
     "name": LLMWhisperer.__name__,
-    "version": "2.0.0",
+    "version": "1.0.0",
     "adapter": LLMWhisperer,
     "description": "LLMWhisperer X2Text adapter",
     "is_active": True,

From 8ab74471e1da4e7c7c45f9ca6222fc97a55dd950 Mon Sep 17 00:00:00 2001
From: jagadeeswaran-zipstack <jagadeeswaran@zipstack.com>
Date: Thu, 16 Jan 2025 10:22:54 +0530
Subject: [PATCH 5/5] adapter name change

---
 .../sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
index 59b61a1d..9f3d862e 100644
--- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
+++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -44,8 +44,8 @@ def __init__(self, settings: dict[str, Any]):
         
 
     ID = "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93"
-    NAME = "LLMWhisperer V2"
-    DESCRIPTION = "LLMWhisperer V2 X2Text"
+    NAME = "LLMWhisperer"
+    DESCRIPTION = "LLMWhisperer X2Text"
     ICON = "/icons/adapter-icons/LLMWhispererV2.png"
 
     @staticmethod