Sync vNext with Main #404

Closed · wants to merge 5 commits
5 changes: 2 additions & 3 deletions app/enrichment/app.py
@@ -320,8 +320,7 @@ def poll_queue() -> None:
i = 0
for chunk in chunks:

statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}", State.INDEXING)
# statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}", State.PROCESSING
statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}")
# open the file and extract the content
blob_path_plus_sas = utilities_helper.get_blob_and_sas(
ENV["AZURE_BLOB_STORAGE_CONTAINER"] + '/' + chunk.name)
@@ -403,7 +402,7 @@ def poll_queue() -> None:
backoff = random.randint(
int(ENV["EMBEDDING_REQUEUE_BACKOFF"]) * requeue_count, max_seconds)
queue_client.send_message(message_string, visibility_timeout=backoff)
statusLog.upsert_document(blob_path, f'Message requeued to embeddings queue, attempt {str(requeue_count)}. Visible in {str(backoff)} seconds. Error: {str(error)}.',
statusLog.upsert_document(blob_path, f'Message requed to embeddings queue, attempt {str(requeue_count)}. Visible in {str(backoff)} seconds. Error: {str(error)}.',
StatusClassification.ERROR,
State.QUEUED)
else:
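
The requeue hunk above computes a randomized visibility timeout that grows with each retry attempt. Below is a minimal runnable sketch of that pattern; the stub queue client and the `max_seconds` cap are assumptions for illustration, standing in for the Azure Storage queue client used in app.py.

```python
import random

# Sketch of the requeue-with-backoff pattern in the hunk above.
# EMBEDDING_REQUEUE_BACKOFF comes from the diff; max_seconds is an assumed cap.
ENV = {"EMBEDDING_REQUEUE_BACKOFF": "60"}
max_seconds = 3600  # assumed upper bound on the visibility delay

def requeue(queue_client, message_string: str, requeue_count: int) -> int:
    """Requeue a message with a randomized, linearly growing backoff."""
    # Lower bound grows with each attempt; this assumes the product stays
    # below max_seconds, as random.randint requires a <= b.
    backoff = random.randint(
        int(ENV["EMBEDDING_REQUEUE_BACKOFF"]) * requeue_count, max_seconds)
    # The message stays invisible to consumers for `backoff` seconds.
    queue_client.send_message(message_string, visibility_timeout=backoff)
    return backoff

class _StubQueue:
    """Hypothetical stand-in for the real queue client."""
    def send_message(self, content, visibility_timeout=None):
        print(f"requeued for {visibility_timeout}s: {content}")

print(requeue(_StubQueue(), "upload/contract.pdf", requeue_count=2))
```
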
6 changes: 1 addition & 5 deletions app/frontend/src/api/models.ts
@@ -84,13 +84,10 @@ export type GetUploadStatusRequest = {
export const enum FileState {
All = "ALL",
Processing = "PROCESSING",
Indexing = "INDEXING",
Skipped = "SKIPPED",
Queued = "QUEUED",
Complete = "COMPLETE",
Error = "ERROR",
THROTTLED = "THROTTLED",
UPLOADED = "UPLOADED"
Error = "ERROR"
}


@@ -138,7 +135,6 @@ export const enum StatusLogClassification {
// shared code (functions/shared_code/status_log.py)
export const enum StatusLogState {
Processing = "Processing",
Indexing = "Indexing",
Skipped = "Skipped",
Queued = "Queued",
Complete = "Complete",
30 changes: 9 additions & 21 deletions app/frontend/src/components/FileStatus/DocumentsDetailList.tsx
@@ -100,19 +100,19 @@ export const DocumentsDetailList = ({ items, onFilesSorted}: Props) => {
ariaLabel: 'Column operations for state, Press to sort by states',
onColumnClick: onColumnClick,
data: 'string',
// onRender: (item: IDocument) => (
// <TooltipHost content={`${item.state_description} `}>
// <span>{item.state}</span>
// </TooltipHost>
// ),
onRender: (item: IDocument) => (
<TooltipHost content={`${item.state_description} `}>
<span>{item.state}</span>
</TooltipHost>
),
isPadded: true,
},
{
key: 'column4',
name: 'Submitted On',
fieldName: 'upload_timestamp',
minWidth: 90,
maxWidth: 120,
minWidth: 70,
maxWidth: 90,
isResizable: true,
isCollapsible: true,
ariaLabel: 'Column operations for submitted on date, Press to sort by submitted date',
@@ -127,8 +127,8 @@
key: 'column5',
name: 'Last Updated',
fieldName: 'modified_timestamp',
minWidth: 90,
maxWidth: 120,
minWidth: 70,
maxWidth: 90,
isResizable: true,
isSorted: true,
isSortedDescending: false,
@@ -142,18 +142,6 @@
return <span>{item.modified_timestamp}</span>;
},
},
{
key: 'column6',
name: 'Status Detail',
fieldName: 'state_description',
minWidth: 90,
maxWidth: 200,
isResizable: true,
isCollapsible: true,
ariaLabel: 'Column operations for status detail',
data: 'string',
onColumnClick: onColumnClick
}
]);

return (
3 changes: 0 additions & 3 deletions app/frontend/src/components/FileStatus/FileStatus.tsx
@@ -29,11 +29,8 @@ const dropdownFileStateOptions = [
{ key: FileState.Complete, text: 'Completed' },
{ key: FileState.Error, text: 'Error' },
{ key: FileState.Processing, text: 'Processing' },
{ key: FileState.Indexing, text: 'Indexing' },
{ key: FileState.Queued, text: 'Queued' },
{ key: FileState.Skipped, text: 'Skipped'},
{ key: FileState.UPLOADED, text: 'Uploaded'},
{ key: FileState.THROTTLED, text: 'Throttled'},
];

interface Props {
Binary file removed docs/images/frontend-watch.png
Binary file removed docs/images/vite-debug.png
Binary file removed docs/images/webapp-backend.png
25 changes: 1 addition & 24 deletions docs/knownissues.md
@@ -101,33 +101,10 @@ InvalidApiSetId - The account type 'OpenAI' is either invalid or unavailable in
### Solution:
Deploy Azure OpenAI Service only in the supported regions. Review the local.env file and update the location as per supported models and [region availability](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#model-summary-table-and-region-availability)


## Error: jq parse error: Expected value before ','

If you see a jq parse error while deploying, one of the makefile scripts that extracts environment variables has failed to find a value it expects to be there. The related files are main.parameters.json, which holds the variables output by Bicep during the infrastructure create, and the env file used at build and deploy time.

### Solution:
To resolve, carefully check your deployment .env file for any missing but required values. There are rare times when ARM has issues and output values are not written; in that case, double-check your configuration and rerun the ```make deploy``` and/or ```make extract-env``` command so that the bicep outputs can be written again.
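
As a rough companion to that check, the sketch below scans a deployment env file for required keys that are missing or empty; the key names and the local.env path are assumptions for the example, not the real list the makefile uses.

```python
from pathlib import Path

# Hypothetical helper: report required keys that are missing or empty in a
# deployment env file. The key names here are assumed for illustration only.
REQUIRED_KEYS = ["LOCATION", "WORKSPACE", "SUBSCRIPTION_ID"]

def missing_env_values(env_path: str) -> list[str]:
    values = {}
    for line in Path(env_path).read_text().splitlines():
        line = line.strip()
        # Skip blanks and comments; keep simple KEY=VALUE pairs.
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            values[key.strip()] = value.strip().strip('"')
    return [key for key in REQUIRED_KEYS if not values.get(key)]

if Path("local.env").exists():
    print(missing_env_values("local.env"))  # e.g. ['WORKSPACE']
```
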

## Error: Creation of new Media Service accounts are not allowed as the resource has been deprecated

### Solution:
Media Services is scheduled for retirement on 30th June 2024; this is the [guide](https://learn.microsoft.com/en-us/azure/media-services/latest/azure-media-services-retirement). On deeper investigation, Video Indexer, which is the service we use that sits on top of Media Services, will switch away from this before the end date:

```
Is Azure Video Indexer being retired?
No, Azure Video Indexer isn't part of the Media Services retirement. Although Video Indexer currently relies on a Media Services account as part of its workflow, this dependency will be eliminated before Media Services is retired on June 30, 2024. See the following for more [impact of Media Services retirement for Video Indexer](https://aka.ms/vi-ams-retirement-announcement)
```

As of today, Video Indexer still requires a Media Services account to be created, and so we can't remove it from the bicep deployment. We will need to assess closer to the date whether VI works without the service, and we can then remove the dependency.

The error is interesting as it seems to indicate the media service cannot be created. This is not the case; it does work in regions where VI and Media Services are available. I have updated this to an enhancement and we will add a ticket to the board to action this when VI can be deployed without this supporting service.

## Error: Token limit often exceeded with PDF files

### Solution:

The root of this is table processing. If a table is greater than our target token count for a chunk, the limit is not respected. Essentially, tables are not chunked but treated as units. We have added a task to our board to split tables by chunk size and repeat the table header rows in each chunk.

When we switched to using unstructured.io for non-PDF documents, we were aware of the same issue there. They were planning on adding this feature, so we need to make the change in our code, follow up with unstructured to confirm whether this has been fixed, and update that path as well.

This issue has been updated to an enhancement.
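
To make the proposed fix concrete, here is a rough sketch of splitting a table into token-bounded chunks while repeating the header row at the top of each chunk. The whitespace-based token count and the limits are simplifications standing in for a real tokenizer; this is not the actual chunker in the repo.

```python
# Rough sketch of the proposed table-chunking fix: pack rows into chunks
# under a token budget, repeating the header row in every chunk.

def count_tokens(text: str) -> int:
    return len(text.split())  # crude stand-in for a real tokenizer


def chunk_table(header: str, rows: list[str], max_tokens: int) -> list[str]:
    chunks, current = [], [header]
    budget = max_tokens - count_tokens(header)
    used = 0
    for row in rows:
        cost = count_tokens(row)
        # Flush when the next row would exceed the budget; the len() guard
        # keeps an oversized single row from producing empty chunks.
        if used + cost > budget and len(current) > 1:
            chunks.append("\n".join(current))
            current, used = [header], 0
        current.append(row)
        used += cost
    chunks.append("\n".join(current))
    return chunks


header = "| name | size |"
rows = [f"| file{i}.pdf | {i} MB |" for i in range(6)]
for chunk in chunk_table(header, rows, max_tokens=12):
    print(chunk, "\n---")
```
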
Binary file modified docs/process_flow.drawio.png
18 changes: 5 additions & 13 deletions docs/webapp_debug.md
@@ -6,23 +6,15 @@ The app consists of two layers, namely the frontend user interface components an

To debug the webapp, both frontend and backend, first set breakpoints in your code under the frontend and/or backend. Select the 'Run & Debug' tab from the sidebar in VS Code. Select Python: Flask from the dropdown and hit run. This will initiate local debugging of the backend code.

Next verify you have a virtual environment created, which should be seen as a folder called .venv under the root of your workspace. If this doesn't exist, you can create one by following these steps:
![backend debugging](/docs/images/webapp_debug_1.png)

1. Open the command palette (Ctrl+Shift+P)
1. Select the command Python: Create Environment
1. Next select Venv
1. Now select the latest version of Python from the list
1. Finally enter check marks next to all requirements.txt files listed and hit OK
Next, you will need to initiate debugging of the frontend code. To do this, select 'Vite: Debug' from the dropdown and hit run.

This will initiate frontend running and debugging. A browser will open and show the web app running under localhost:5000. Next, proceed to interact with the web app by asking a question. In the VS Code interface, your code will hit the breakpoints, frontend or backend, and you will be able to view variables, trace logic, etc. You can switch between the two running debuggers by selecting frontend or backend (flask or vite) from the debug dropdown.

Now initiate debugging of the frontend code by selecting 'Frontend: watch' and then hitting run.
![backend debugging](/docs/images/frontend-watch.png)
![frontend debugging](/docs/images/webapp_debug_2.png)

Finally hit Vite: Debug
![backend debugging](/docs/images/vite-debug.png)
This will initiate frontend running and debugging. A browser will open and show the web app running under localhost:5000. Next, proceed to interact with the web app by asking a question. In the VS Code interface, your code will hit the breakpoints, frontend or backend, and you will be able to view variables, trace logic, etc. You can switch between the two running debuggers by selecting frontend or backend (flask or vite) from the debug dropdown.

A browser will open and show the web app running under localhost:5000. Next, proceed to interact with the web app by asking a question. In the VS Code interface, your code will hit the breakpoints, frontend or backend, and you will be able to view variables, trace logic, etc. You can switch between the two running debuggers by selecting frontend or backend (flask or vite) from the debug dropdown.
![frontend debugging](/docs/images/webapp_debug_3.png)

## Known Issues

2 changes: 1 addition & 1 deletion functions/TextEnrichment/__init__.py
@@ -227,7 +227,7 @@ def main(msg: func.QueueMessage) -> None:

statusLog.upsert_document(
blob_path,
f"{FUNCTION_NAME} - Text enrichment is complete, message sent to embeddings queue",
f"{FUNCTION_NAME} - Text enrichment is complete",
StatusClassification.DEBUG,
State.QUEUED,
)
23 changes: 7 additions & 16 deletions functions/shared_code/status_log.py
@@ -13,7 +13,6 @@
class State(Enum):
""" Enum for state of a process """
PROCESSING = "Processing"
INDEXING = "Indexing"
SKIPPED = "Skipped"
QUEUED = "Queued"
COMPLETE = "Complete"
@@ -156,9 +155,6 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
if json_document['state'] != state.value:
json_document['state'] = state.value
json_document['state_timestamp'] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Update state description with latest status
json_document['state_description'] = status

# Append a new item to the array
status_updates = json_document["status_updates"]
@@ -180,7 +176,7 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
"file_name": base_name,
"state": str(state.value),
"start_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"state_description": status,
"state_description": "",
"state_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"status_updates": [
{
@@ -198,7 +194,7 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
"file_name": base_name,
"state": str(state.value),
"start_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"state_description": status,
"state_description": "",
"state_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"status_updates": [
{
@@ -212,34 +208,29 @@

#self.container.upsert_item(body=json_document)
self._log_document[document_id] = json_document


def update_document_state(self, document_path, status, state=State.PROCESSING):

def update_document_state(self, document_path, state_str):
"""Updates the state of the document in the storage"""
try:
document_id = self.encode_document_id(document_path)
logging.info(f"{state_str} DocumentID - {document_id}")
logging.info(f"{status} DocumentID - {document_id}")
document_id = self.encode_document_id(document_path)
if self._log_document.get(document_id, "") != "":
json_document = self._log_document[document_id]

json_document['state'] = state.value
json_document['state_description'] = status
json_document['state'] = state_str
json_document['state_timestamp'] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
self.save_document(document_path)
self._log_document[document_id] = json_document
else:
logging.warning(f"Document with ID {document_id} not found.")
except Exception as err:
logging.error(f"An error occurred while updating the document state: {str(err)}")

logging.error(f"An error occurred while updating the document state: {str(err)}")

def save_document(self, document_path):
"""Saves the document in the storage"""
document_id = self.encode_document_id(document_path)
self.container.upsert_item(body=self._log_document[document_id])
self._log_document[document_id] = ""


def get_stack_trace(self):
""" Returns the stack trace of the current exception"""
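
Since the hunks above rework how document state is recorded, here is a minimal in-memory sketch of the pattern: documents keyed by an encoded path, each carrying a state string and a state timestamp. The encoding scheme and the stand-in class are assumptions for illustration; the real class in the diff persists to Cosmos DB via container.upsert_item.

```python
import base64
import logging
from datetime import datetime

class StatusLogSketch:
    """In-memory stand-in for the Cosmos DB-backed status log in the diff."""

    def __init__(self):
        self._log_document = {}

    def encode_document_id(self, document_path: str) -> str:
        # Assumed scheme for the example; the real encoding may differ.
        return base64.urlsafe_b64encode(document_path.encode()).decode()

    def update_document_state(self, document_path: str, state_str: str) -> None:
        """Mirror of the two-argument update_document_state variant in the diff."""
        document_id = self.encode_document_id(document_path)
        doc = self._log_document.setdefault(document_id, {"state_description": ""})
        # Record the new state and when it changed.
        doc["state"] = state_str
        doc["state_timestamp"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info("%s DocumentID - %s", state_str, document_id)

log = StatusLogSketch()
log.update_document_state("upload/contract.pdf", "Indexing 1/5")
print(log._log_document)
```
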
1 change: 1 addition & 0 deletions scripts/inf-create.sh
@@ -87,6 +87,7 @@ if [ -n "${IN_AUTOMATION}" ]; then
echo "Please create the Azure AD objects using the script at /scripts/create-ad-objs-for-deployment.sh and set the AD_WEBAPP_CLIENT_ID pipeline variable in Azure DevOps."
exit 1
fi
aadWebSPId=$ARM_SERVICE_PRINCIPAL_ID
aadMgmtAppId=$AD_MGMTAPP_CLIENT_ID
aadMgmtAppSecret=$AD_MGMTAPP_CLIENT_SECRET
aadMgmtSPId=$AD_MGMT_SERVICE_PRINCIPAL_ID