Skip to content

Commit

Permalink
Merge pull request #798 from ruandersMSFT/Language_v3
Browse files (browse the repository at this point in the history)
Update Language Detection and Translation to use Azure Cognitive Multi Service API
  • Loading branch information
dayland authored Aug 8, 2024
2 parents 79902d6 + 497a769 commit f295135
Show file tree
Hide file tree
Showing 15 changed files with 55 additions and 89 deletions.
3 changes: 0 additions & 3 deletions app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@
"TARGET_TRANSLATION_LANGUAGE": "en",
"ENRICHMENT_ENDPOINT": None,
"ENRICHMENT_KEY": None,
"AZURE_AI_TRANSLATION_DOMAIN": "api.cognitive.microsofttranslator.com",
"BING_SEARCH_ENDPOINT": "https://api.bing.microsoft.com/",
"BING_SEARCH_KEY": "",
"ENABLE_BING_SAFE_SEARCH": "true",
Expand Down Expand Up @@ -208,7 +207,6 @@
ENV["TARGET_TRANSLATION_LANGUAGE"],
ENV["ENRICHMENT_ENDPOINT"],
ENV["ENRICHMENT_KEY"],
ENV["AZURE_AI_TRANSLATION_DOMAIN"],
str_to_bool.get(ENV["USE_SEMANTIC_RERANKER"])
),
Approaches.ChatWebRetrieveRead: ChatWebRetrieveRead(
Expand Down Expand Up @@ -246,7 +244,6 @@
ENV["TARGET_TRANSLATION_LANGUAGE"],
ENV["ENRICHMENT_ENDPOINT"],
ENV["ENRICHMENT_KEY"],
ENV["AZURE_AI_TRANSLATION_DOMAIN"],
str_to_bool.get(ENV["USE_SEMANTIC_RERANKER"])
),
Approaches.GPTDirect: GPTDirectApproach(
Expand Down
26 changes: 16 additions & 10 deletions app/backend/approaches/chatreadretrieveread.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def __init__(
target_translation_language: str,
enrichment_endpoint:str,
enrichment_key:str,
azure_ai_translation_domain: str,
use_semantic_reranker: bool

):
Expand All @@ -125,7 +124,6 @@ def __init__(
self.enrichment_key=enrichment_key
self.oai_endpoint=oai_endpoint
self.embedding_service_url = enrichment_appservice_uri
self.azure_ai_translation_domain=azure_ai_translation_domain
self.use_semantic_reranker=use_semantic_reranker

openai.api_base = oai_endpoint
Expand Down Expand Up @@ -442,18 +440,28 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
def detect_language(self, text: str) -> str:
""" Function to detect the language of the text"""
try:
endpoint_region = self.enrichment_endpoint.split("https://")[1].split(".api")[0]
api_detect_endpoint = f"https://{self.azure_ai_translation_domain}/detect?api-version=3.0"
api_detect_endpoint = f"{self.enrichment_endpoint}language/:analyze-text?api-version=2023-04-01"
headers = {
'Ocp-Apim-Subscription-Key': self.enrichment_key,
'Content-type': 'application/json',
'Ocp-Apim-Subscription-Region': endpoint_region
}
data = [{"text": text}]

data = {
"kind": "LanguageDetection",
"analysisInput":{
"documents":[
{
"id":"1",
"text": text
}
]
}
}

response = requests.post(api_detect_endpoint, headers=headers, json=data)

if response.status_code == 200:
detected_language = response.json()[0]['language']
detected_language = response.json()["results"]["documents"][0]["detectedLanguage"]["iso6391Name"]
return detected_language
else:
raise Exception(f"Error detecting language: {response.status_code}")
Expand All @@ -462,12 +470,10 @@ def detect_language(self, text: str) -> str:

def translate_response(self, response: str, target_language: str) -> str:
""" Function to translate the response to target language"""
endpoint_region = self.enrichment_endpoint.split("https://")[1].split(".api")[0]
api_translate_endpoint = f"https://{self.azure_ai_translation_domain}/translate?api-version=3.0"
api_translate_endpoint = f"{self.enrichment_endpoint}translator/text/v3.0/translate?api-version=3.0"
headers = {
'Ocp-Apim-Subscription-Key': self.enrichment_key,
'Content-type': 'application/json',
'Ocp-Apim-Subscription-Region': endpoint_region
}
params={'to': target_language }
data = [{
Expand Down
3 changes: 0 additions & 3 deletions app/backend/approaches/comparewebwithwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def __init__(
target_translation_language: str,
enrichment_endpoint:str,
enrichment_key:str,
azure_ai_translation_domain: str,
use_semantic_reranker: bool
):
self.search_client = search_client
Expand All @@ -84,7 +83,6 @@ def __init__(
self.model_name = model_name
self.model_version = model_version
self.enrichment_appservice_url = enrichment_appservice_url
self.azure_ai_translation_domain = azure_ai_translation_domain
self.use_semantic_reranker = use_semantic_reranker

# openai.api_base = oai_endpoint
Expand Down Expand Up @@ -126,7 +124,6 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
self.target_translation_language,
self.enrichment_endpoint,
self.enrichment_key,
self.azure_ai_translation_domain,
self.use_semantic_reranker
)
rrr_response = chat_rrr_approach.run(history, overrides, {}, thought_chain)
Expand Down
1 change: 0 additions & 1 deletion docs/features/document_pre_processing.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ SUBMIT_REQUEUE_HIDE_SECONDS | If a throttling event occurs on upload, the messag
TARGET_TRANSLATION_LANGUAGE | The target language that the process will translate chunks into
ENRICHMENT_BACKOFF | The number of seconds a message will be invisible when resubmitted to the enrichment queue after a failure due to throttling. This delay increases exponentially with each subsequent failure
MAX_ENRICHMENT_REQUEUE_COUNT | The maximum number of times a message will be pushed to the enrichment queue after a failure in the enrichment function
TARGET_TRANSLATION_LANGUAGE | The language you wish all chunks to be translated to
FR_API_VERSION | The API version of [Azure AI Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/overview?view=doc-intel-3.1.0) which you wish to use

Likewise, below are some configuration values of the App Service that you may wish to adapt to your scenario
Expand Down
25 changes: 18 additions & 7 deletions functions/ImageEnrichment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
azure_blob_content_storage_container = os.environ[
"BLOB_STORAGE_ACCOUNT_OUTPUT_CONTAINER_NAME"
]
azure_ai_translation_domain = os.environ["AZURE_AI_TRANSLATION_DOMAIN"]

enrichmentEndpoint = os.environ["ENRICHMENT_ENDPOINT"]

# Cosmos DB
cosmosdb_url = os.environ["COSMOSDB_URL"]
Expand All @@ -50,17 +51,16 @@
targetTranslationLanguage = os.environ["TARGET_TRANSLATION_LANGUAGE"]

API_DETECT_ENDPOINT = (
f"https://{azure_ai_translation_domain}/detect?api-version=3.0"
f"{enrichmentEndpoint}language/:analyze-text?api-version=2023-04-01"
)
API_TRANSLATE_ENDPOINT = (
f"https://{azure_ai_translation_domain}/translate?api-version=3.0"
f"{enrichmentEndpoint}translator/text/v3.0/translate?api-version=3.0"
)

MAX_CHARS_FOR_DETECTION = 1000
translator_api_headers = {
"Ocp-Apim-Subscription-Key": cognitive_services_key,
"Content-type": "application/json",
"Ocp-Apim-Subscription-Region": cognitive_services_account_location,
}

# Note that "caption" and "denseCaptions" are only supported in Azure GPU regions (East US, France Central,
Expand Down Expand Up @@ -94,14 +94,25 @@


def detect_language(text):
data = [{"text": text[:MAX_CHARS_FOR_DETECTION]}]
data = {
"kind": "LanguageDetection",
"analysisInput":{
"documents":[
{
"id":"1",
"text": text[:MAX_CHARS_FOR_DETECTION]
}
]
}
}

response = requests.post(
API_DETECT_ENDPOINT, headers=translator_api_headers, json=data
)
if response.status_code == 200:
print(response.json())
detected_language = response.json()[0]["language"]
detection_confidence = response.json()[0]["score"]
detected_language = response.json()["results"]["documents"][0]["detectedLanguage"]["iso6391Name"]
detection_confidence = response.json()["results"]["documents"][0]["detectedLanguage"]["confidenceScore"]

return detected_language, detection_confidence

Expand Down
37 changes: 21 additions & 16 deletions functions/TextEnrichment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@
enrichment_backoff = int(os.environ["ENRICHMENT_BACKOFF"])
azure_blob_content_storage_container = os.environ["BLOB_STORAGE_ACCOUNT_OUTPUT_CONTAINER_NAME"]
queueName = os.environ["EMBEDDINGS_QUEUE"]
azure_ai_translation_domain = os.environ["AZURE_AI_TRANSLATION_DOMAIN"]
azure_ai_text_analytics_domain = os.environ["AZURE_AI_TEXT_ANALYTICS_DOMAIN"]

FUNCTION_NAME = "TextEnrichment"
MAX_CHARS_FOR_DETECTION = 1000
Expand All @@ -61,11 +59,8 @@ def main(msg: func.QueueMessage) -> None:
the target language, it will translate the chunks to the target language.'''

try:
endpoint_region = enrichmentEndpoint.split("https://")[1].split(".api")[0]

apiDetectEndpoint = f"https://{azure_ai_translation_domain}/detect?api-version=3.0"
apiTranslateEndpoint = f"https://{azure_ai_translation_domain}/translate?api-version=3.0"
enrich_endpoint = f"https://{endpoint_region}.{azure_ai_text_analytics_domain}/language/:analyze-text?api-version=2022-05-01"
apiTranslateEndpoint = f"{enrichmentEndpoint}translator/text/v3.0/translate?api-version=3.0"
apiLanguageEndpoint = f"{enrichmentEndpoint}language/:analyze-text?api-version=2023-04-01"

message_body = msg.get_body().decode("utf-8")
message_json = json.loads(message_body)
Expand Down Expand Up @@ -109,14 +104,24 @@ def main(msg: func.QueueMessage) -> None:
# detect language
headers = {
'Ocp-Apim-Subscription-Key': enrichmentKey,
'Content-type': 'application/json',
'Ocp-Apim-Subscription-Region': endpoint_region
}
data = [{"text": chunk_content}]
'Content-type': 'application/json'
}

data = {
"kind": "LanguageDetection",
"analysisInput":{
"documents":[
{
"id":"1",
"text": chunk_content
}
]
}
}

response = requests.post(apiDetectEndpoint, headers=headers, json=data)
response = requests.post(apiLanguageEndpoint, headers=headers, json=data)
if response.status_code == 200:
detected_language = response.json()[0]['language']
detected_language = response.json()["results"]["documents"][0]["detectedLanguage"]["iso6391Name"]
statusLog.upsert_document(
blob_path,
f"{FUNCTION_NAME} - detected language of text is {detected_language}.",
Expand All @@ -127,7 +132,7 @@ def main(msg: func.QueueMessage) -> None:
# error or requeue
requeue(response, message_json)
return

# If the language of the document is not equal to target language then translate the generated chunks
if detected_language != targetTranslationLanguage:
statusLog.upsert_document(
Expand Down Expand Up @@ -173,7 +178,7 @@ def main(msg: func.QueueMessage) -> None:
]
}
}
response = requests.post(enrich_endpoint, headers=enrich_headers, json=enrich_data, params=params)
response = requests.post(apiLanguageEndpoint, headers=enrich_headers, json=enrich_data, params=params)
try:
entities = response.json()['results']['documents'][0]['entities']
except:
Expand Down Expand Up @@ -204,7 +209,7 @@ def main(msg: func.QueueMessage) -> None:
]
}
}
response = requests.post(enrich_endpoint, headers=enrich_headers, json=enrich_data, params=params)
response = requests.post(apiLanguageEndpoint, headers=enrich_headers, json=enrich_data, params=params)
try:
key_phrases = response.json()['results']['documents'][0]['keyPhrases']
except:
Expand Down
2 changes: 0 additions & 2 deletions infra/core/host/functions/functions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,6 @@ resource "azurerm_linux_function_app" "function_app" {
COSMOSDB_KEY = "@Microsoft.KeyVault(SecretUri=${var.keyVaultUri}secrets/COSMOSDB-KEY)"
AZURE_SEARCH_SERVICE_ENDPOINT = var.azureSearchServiceEndpoint
AZURE_SEARCH_INDEX = var.azureSearchIndex
AZURE_AI_TRANSLATION_DOMAIN = var.azure_ai_translation_domain
AZURE_AI_TEXT_ANALYTICS_DOMAIN = var.azure_ai_text_analytics_domain
}

identity {
Expand Down
8 changes: 0 additions & 8 deletions infra/core/host/functions/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,3 @@ variable "endpointSuffix" {
type = string
default = "core.windows.net"
}

variable "azure_ai_translation_domain" {
type = string
}

variable "azure_ai_text_analytics_domain" {
type = string
}
3 changes: 0 additions & 3 deletions infra/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ module "backend" {
ENRICHMENT_APPSERVICE_URL = module.enrichmentApp.uri
ENRICHMENT_ENDPOINT = module.cognitiveServices.cognitiveServiceEndpoint
APPLICATION_TITLE = var.applicationtitle == "" ? "Information Assistant, built with Azure OpenAI" : var.applicationtitle
AZURE_AI_TRANSLATION_DOMAIN = var.azure_ai_translation_domain
USE_SEMANTIC_RERANKER = var.use_semantic_reranker
BING_SEARCH_ENDPOINT = var.enableWebChat ? module.bingSearch.endpoint : ""
ENABLE_WEB_CHAT = var.enableWebChat
Expand Down Expand Up @@ -348,8 +347,6 @@ module "functions" {
azureSearchIndex = var.searchIndexName
azureSearchServiceEndpoint = module.searchServices.endpoint
endpointSuffix = var.azure_storage_domain
azure_ai_text_analytics_domain = var.azure_ai_text_analytics_domain
azure_ai_translation_domain = var.azure_ai_translation_domain

depends_on = [
module.storage,
Expand Down
8 changes: 0 additions & 8 deletions infra/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,6 @@ output "ENABLE_BING_SAFE_SEARCH" {
value = var.enableBingSafeSearch
}

output "AZURE_AI_TRANSLATION_DOMAIN" {
value = var.azure_ai_translation_domain
}

output "AZURE_AI_TEXT_ANALYTICS_DOMAIN" {
value = var.azure_ai_text_analytics_domain
}

output "AZURE_ARM_MANAGEMENT_API" {
value = var.azure_arm_management_api
}
Expand Down
8 changes: 0 additions & 8 deletions infra/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,6 @@ variable "azure_arm_management_api" {
type = string
}

variable "azure_ai_translation_domain" {
type = string
}

variable "azure_ai_text_analytics_domain" {
type = string
}

variable "azure_search_domain" {
type = string
}
Expand Down
2 changes: 0 additions & 2 deletions scripts/environments/AzureEnvironments/AzureCloud.env
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
export TF_VAR_arm_template_schema_mgmt_api="https://schema.management.azure.com"
export TF_VAR_azure_portal_domain="https://portal.azure.com"
export TF_VAR_azure_ai_translation_domain="api.cognitive.microsofttranslator.com"
export TF_VAR_azure_ai_text_analytics_domain="api.cognitive.microsoft.com"
export TF_VAR_azure_search_domain="search.windows.net"
export TF_VAR_use_semantic_reranker=true
export TF_VAR_azure_storage_domain="core.windows.net"
Expand Down
2 changes: 0 additions & 2 deletions scripts/environments/AzureEnvironments/AzureUSGovernment.env
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
export TF_VAR_arm_template_schema_mgmt_api="https://schema.management.usgovcloudapi.net"
export TF_VAR_azure_portal_domain="https://portal.azure.us"
export TF_VAR_azure_ai_translation_domain="api.cognitive.microsofttranslator.us"
export TF_VAR_azure_ai_text_analytics_domain="api.cognitive.microsoft.us"
export TF_VAR_azure_search_domain="search.azure.us"
export TF_VAR_use_semantic_reranker=false
export TF_VAR_azure_storage_domain="core.usgovcloudapi.net"
Expand Down
8 changes: 0 additions & 8 deletions scripts/json-to-env.function.debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,6 @@ jq -r --arg secrets "$secrets" '
{
"path": "DEPLOYMENT_KEYVAULT_NAME",
"env_var": "DEPLOYMENT_KEYVAULT_NAME"
},
{
"path": "AZURE_AI_TRANSLATION_DOMAIN",
"env_var": "AZURE_AI_TRANSLATION_DOMAIN"
},
{
"path": "AZURE_AI_TEXT_ANALYTICS_DOMAIN",
"env_var": "AZURE_AI_TEXT_ANALYTICS_DOMAIN"
}
]
as $env_vars_to_extract
Expand Down
8 changes: 0 additions & 8 deletions scripts/json-to-env.webapp.debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,6 @@ jq -r '
"path": "ENABLEE_BING_SAFE_SEARCH",
"env_var": "ENABLE_BING_SAFE_SEARCH"
},
{
"path": "AZURE_AI_TRANSLATION_DOMAIN",
"env_var": "AZURE_AI_TRANSLATION_DOMAIN"
},
{
"path": "AZURE_AI_TEXT_ANALYTICS_DOMAIN",
"env_var": "AZURE_AI_TEXT_ANALYTICS_DOMAIN"
},
{
"path": "AZURE_ARM_MANAGEMENT_API",
"env_var": "AZURE_ARM_MANAGEMENT_API"
Expand Down

0 comments on commit f295135

Please sign in to comment.