Google Search API #3049
base: 0.2
Changes from 5 commits
f4ca7c0
f8d684a
7e4d8ec
14bdd4a
869bcf3
2423a11
81ef40d
aec42be
bd45a68
65b096c
d467dc4
```diff
@@ -19,6 +19,7 @@ __pycache__/
 *.so

 # Distribution / packaging
+.devcontainer/
 .Python
 build/
 develop-eggs/
```
```diff
@@ -9,13 +9,16 @@
 from typing_extensions import Annotated

 from ... import Agent, AssistantAgent, ConversableAgent, GroupChat, GroupChatManager, OpenAIWrapper, UserProxyAgent
 from ...browser_utils import SimpleTextBrowser
+from ...browser_utils.bing_browser import BingTextBrowser
+from ...browser_utils.google_broswer import GoogleTextBrowser
 from ...code_utils import content_str
 from ...oai.openai_utils import filter_config
 from ...token_count_utils import count_token, get_max_token_limit

 logger = logging.getLogger(__name__)

+BROWSERS = {"google": GoogleTextBrowser, "bing": BingTextBrowser}


 class WebSurferAgent(ConversableAgent):
     """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages."""
```
```diff
@@ -40,6 +43,7 @@ def __init__(
         llm_config: Optional[Union[Dict, Literal[False]]] = None,
         summarizer_llm_config: Optional[Union[Dict, Literal[False]]] = None,
         default_auto_reply: Optional[Union[str, Dict, None]] = "",
+        browser_name: str = "bing",
```

**Review comment:** Could we use an enum for this lookup table?
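A minimal sketch of the enum idea, built around the PR's existing `BROWSERS` dict; the `BrowserType` name is illustrative, not part of the PR, and the import paths assume the PR's package layout:

```python
from enum import Enum

# Assumed absolute paths for the PR's relative imports.
from autogen.browser_utils.bing_browser import BingTextBrowser
from autogen.browser_utils.google_broswer import GoogleTextBrowser


class BrowserType(str, Enum):
    """Hypothetical enum keying the browser lookup table."""

    GOOGLE = "google"
    BING = "bing"


BROWSERS = {BrowserType.GOOGLE: GoogleTextBrowser, BrowserType.BING: BingTextBrowser}

# Invalid names now fail fast with a clear ValueError:
chosen_browser = BROWSERS[BrowserType("bing")]  # -> BingTextBrowser
```

This keeps the dict but makes the set of valid names explicit and typo-safe.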
```diff
         browser_config: Optional[Union[Dict, None]] = None,
     ):
         super().__init__(
```
```diff
@@ -58,7 +62,9 @@ def __init__(
         self._create_summarizer_client(summarizer_llm_config, llm_config)

         # Create the browser
-        self.browser = SimpleTextBrowser(**(browser_config if browser_config else {}))
+        self.browser_name = browser_name
+        chosen_browser = BROWSERS[self.browser_name]
+        self.browser = chosen_browser(**(browser_config if browser_config else {}))

         inner_llm_config = copy.deepcopy(llm_config)
```
```diff
@@ -136,7 +142,7 @@ def _browser_state() -> Tuple[str, str]:
             description="Perform an INFORMATIONAL web search query then return the search results.",
         )
         def _informational_search(query: Annotated[str, "The informational web search query to perform."]) -> str:
-            self.browser.visit_page(f"bing: {query}")
+            self.browser.visit_page(f"{self.browser_name}: {query}")
             header, content = _browser_state()
             return header.strip() + "\n=======================\n" + content
```
```diff
@@ -146,7 +152,7 @@ def _informational_search(query: Annotated[str, "The informational web search qu
             description="Perform a NAVIGATIONAL web search query then immediately navigate to the top result. Useful, for example, to navigate to a particular Wikipedia article or other known destination. Equivalent to Google's \"I'm Feeling Lucky\" button.",
         )
         def _navigational_search(query: Annotated[str, "The navigational web search query to perform."]) -> str:
-            self.browser.visit_page(f"bing: {query}")
+            self.browser.visit_page(f"{self.browser_name}: {query}")

             # Extract the first link
             m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
```
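For context, a hedged usage sketch of the new parameter; the config values shown are placeholders rather than code from the PR:

```python
from autogen.agentchat.contrib.web_surfer import WebSurferAgent

llm_config = {"config_list": [{"model": "gpt-4", "api_key": "sk-..."}]}  # placeholder

surfer = WebSurferAgent(
    name="web_surfer",
    llm_config=llm_config,
    summarizer_llm_config=llm_config,
    browser_name="google",  # any key of BROWSERS: "google" or "bing"
    browser_config={"viewport_size": 1024 * 8},  # forwarded to the chosen browser class
)
```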
@@ -0,0 +1,216 @@ (new file)

```python
import io
import mimetypes
import os
import re
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urljoin, urlparse

import markdownify
import requests
from bs4 import BeautifulSoup

# Optional PDF support
IS_PDF_CAPABLE = False
try:
    import pdfminer
    import pdfminer.high_level

    IS_PDF_CAPABLE = True
except ModuleNotFoundError:
    pass

# Other optional dependencies
try:
    import pathvalidate
except ModuleNotFoundError:
    pass


class SimpleTextBrowser:
```
**Review comment:** Maybe we could have a better name to highlight that this is the class to be derived from when creating other text browsers: `TextBrowserBase`? Also, do we want a class with virtual methods to override, or a pure abstract class, with `SimpleTextBrowser` being just an implementation of it?

**Reply:** @colombod I believe a base class would suffice, and we can override the methods we need from there. More than this is honestly over-engineering and will add unnecessary complexity. Let me know your opinion.
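A minimal sketch of the pure-abstract design under discussion; `TextBrowserBase` is a hypothetical name, and the body is a stand-in, not code from this PR:

```python
from abc import ABC, abstractmethod


class TextBrowserBase(ABC):
    """Hypothetical pure-abstract base: subclasses must implement navigation."""

    @abstractmethod
    def visit_page(self, path_or_uri: str) -> str:
        """Navigate to a page and return the visible viewport."""


class SimpleTextBrowser(TextBrowserBase):
    """Concrete implementation using plain HTTP requests."""

    def visit_page(self, path_or_uri: str) -> str:
        return f"(fetched {path_or_uri})"  # stand-in for the real fetch logic
```

The lighter alternative the author prefers keeps `SimpleTextBrowser` concrete and lets `BingTextBrowser` and `GoogleTextBrowser` override only the search-specific methods.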
"""(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use.""" | ||
|
||
def __init__( | ||
self, | ||
start_page: Optional[str] = None, | ||
viewport_size: Optional[int] = 1024 * 8, | ||
downloads_folder: Optional[Union[str, None]] = None, | ||
base_url: str = None, | ||
api_key: Optional[Union[str, None]] = None, | ||
request_kwargs: Optional[Union[Dict[str, Any], None]] = None, | ||
): | ||
self.start_page: str = start_page if start_page else "about:blank" | ||
self.viewport_size = viewport_size # Applies only to the standard uri types | ||
self.downloads_folder = downloads_folder | ||
self.history: List[str] = list() | ||
self.page_title: Optional[str] = None | ||
self.viewport_current_page = 0 | ||
self.viewport_pages: List[Tuple[int, int]] = list() | ||
self.set_address(self.start_page) | ||
self.base_url = base_url | ||
self.api_key = api_key | ||
self.request_kwargs = request_kwargs | ||
|
||
self._page_content = "" | ||
|
||
@property | ||
def address(self) -> str: | ||
"""Return the address of the current page.""" | ||
return self.history[-1] | ||
|
||
@overload | ||
def set_address(self, uri_or_path: str) -> None: | ||
self.history.append(uri_or_path) | ||
|
||
self.viewport_current_page = 0 | ||
|
||
@property | ||
def viewport(self) -> str: | ||
"""Return the content of the current viewport.""" | ||
bounds = self.viewport_pages[self.viewport_current_page] | ||
return self.page_content[bounds[0] : bounds[1]] | ||
|
||
@property | ||
def page_content(self) -> str: | ||
"""Return the full contents of the current page.""" | ||
return self._page_content | ||
|
||
def _set_page_content(self, content: str) -> None: | ||
"""Sets the text content of the current page.""" | ||
self._page_content = content | ||
self._split_pages() | ||
if self.viewport_current_page >= len(self.viewport_pages): | ||
self.viewport_current_page = len(self.viewport_pages) - 1 | ||
|
||
def page_down(self) -> None: | ||
self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1) | ||
|
||
def page_up(self) -> None: | ||
self.viewport_current_page = max(self.viewport_current_page - 1, 0) | ||
|
||
def visit_page(self, path_or_uri: str) -> str: | ||
"""Update the address, visit the page, and return the content of the viewport.""" | ||
self.set_address(path_or_uri) | ||
return self.viewport | ||
|
||
def _split_pages(self) -> None: | ||
# Split only regular pages | ||
if not self.address.startswith("http:") and not self.address.startswith("https:"): | ||
self.viewport_pages = [(0, len(self._page_content))] | ||
return | ||
|
||
# Handle empty pages | ||
if len(self._page_content) == 0: | ||
self.viewport_pages = [(0, 0)] | ||
return | ||
|
||
# Break the viewport into pages | ||
self.viewport_pages = [] | ||
start_idx = 0 | ||
while start_idx < len(self._page_content): | ||
end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator] | ||
# Adjust to end on a space | ||
while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]: | ||
end_idx += 1 | ||
self.viewport_pages.append((start_idx, end_idx)) | ||
start_idx = end_idx | ||
|
||
def _fetch_page(self, url: str) -> None: | ||
try: | ||
# Prepare the request parameters | ||
request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {} | ||
**Review comment:** Have you considered the benefits of using Playwright as opposed to a GET request? A lot of pages need DOM composition and JS execution to fully compose a usable page. LlamaIndex's web loader is doing the same. That would make for a much more reliable user experience and would meet expectations for web-scraping tasks.

**Reply:** I have not thought about using a third-party library. As daunting as it might sound, keeping it simple and as it was was my first approach. However, we can go with either Playwright or the LlamaIndex web loader; both are fine. But then we will need to make sure the whole library implements requests the same way. Let me know your thoughts. @colombod

**Reply:** The proposal I am making is to just use that and move from a web GET to actual web navigation. Playwright is also what LlamaIndex uses for navigation, and it will be useful as we see more multimodal approaches.

**Reply:** @colombod I started working on adding Playwright to the PR. However, its async functionality seems a bit unstable, giving me timeout errors on one request and then working properly on another. I am not sure if you have any experience overcoming this.

**Reply:** That is not the experience I have with it. Can you show one of the issues you are facing? Maybe I can help, or we could get Playwright engineers to investigate.

**Reply:** @colombod Well, I have been trying for the past couple of weeks, but I always get these failed tests:

```
FAILED test/test_browser_utils_google.py::test_simple_text_browser - assert 'Redmond' in 'Page.goto: net::ERR_ABORTED at https://en.wikipedia.org/wiki/Microsoft\nCall log:\nnavigating to "https://en.wikipedia.org/wiki/Microsoft", waiting until "networkidle"\n'
FAILED test/test_browser_utils_google.py::test_google_search - assert "A Google search for 'Microsoft' found" in 'Page.goto: net::ERR_ABORTED at google: Microsoft\nCall log:\nnavigating to "google: Microsoft", waiting until "networkidle"\n'
```

even though I have the timeout increased to 600 seconds:

```python
response = await self._page.goto(url, wait_until="networkidle", timeout=600000)
```
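For reference, a minimal sketch of Playwright-based fetching, not code from this PR. Two things stand out in the logs above: `networkidle` can abort or stall on pages that keep connections open, and the pseudo-URI `google: Microsoft` is being passed straight to `goto()`, which only accepts real URLs. A sketch that sidesteps both, assuming the sync API:

```python
from playwright.sync_api import sync_playwright


def fetch_rendered_html(url: str, timeout_ms: int = 30_000) -> str:
    """Fetch a fully rendered page; 'domcontentloaded' is less prone to
    hanging than 'networkidle' on pages with long-lived connections."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        # Pseudo-URIs like "google: Microsoft" must be translated into a
        # real search URL before reaching goto(), or it raises ERR_ABORTED.
        page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
        html = page.content()
        browser.close()
        return html
```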
```python
            request_kwargs["stream"] = True

            # Send a HTTP request to the URL
            response = requests.get(url, **request_kwargs)
            response.raise_for_status()

            # If the HTTP request returns a status code 200, proceed
            if response.status_code == 200:
                content_type = response.headers.get("content-type", "")
                for ct in ["text/html", "text/plain", "application/pdf"]:
                    if ct in content_type.lower():
                        content_type = ct
                        break

                if content_type == "text/html":
                    # Get the content of the response
                    html = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        html += chunk

                    soup = BeautifulSoup(html, "html.parser")

                    # Remove javascript and style blocks
                    for script in soup(["script", "style"]):
                        script.extract()

                    # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
                    if url.startswith("https://en.wikipedia.org/"):
                        body_elm = soup.find("div", {"id": "mw-content-text"})
                        title_elm = soup.find("span", {"class": "mw-page-title-main"})

                        if body_elm:
                            # What's the title
                            main_title = soup.title.string
                            if title_elm and len(title_elm) > 0:
                                main_title = title_elm.string
                            webpage_text = (
                                "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
                            )
                        else:
                            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
                    else:
                        webpage_text = markdownify.MarkdownConverter().convert_soup(soup)

                    # Convert newlines
                    webpage_text = re.sub(r"\r\n", "\n", webpage_text)

                    # Remove excessive blank lines
                    self.page_title = soup.title.string
                    self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip())
                elif content_type == "text/plain":
                    # Get the content of the response
                    plain_text = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        plain_text += chunk

                    self.page_title = None
                    self._set_page_content(plain_text)
                elif IS_PDF_CAPABLE and content_type == "application/pdf":
                    pdf_data = io.BytesIO(response.raw.read())
                    self.page_title = None
                    self._set_page_content(pdfminer.high_level.extract_text(pdf_data))
                elif self.downloads_folder is not None:
                    # Try producing a safe filename
                    fname = None
                    try:
                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
                    except NameError:
                        pass

                    # No suitable name, so make one
                    if fname is None:
                        extension = mimetypes.guess_extension(content_type)
                        if extension is None:
                            extension = ".download"
                        fname = str(uuid.uuid4()) + extension

                    # Open a file for writing
                    download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
                    with open(download_path, "wb") as fh:
                        for chunk in response.iter_content(chunk_size=512):
                            fh.write(chunk)

                    # Return a page describing what just happened
                    self.page_title = "Download complete."
                    self._set_page_content(f"Downloaded '{url}' to '{download_path}'.")
                else:
                    self.page_title = f"Error - Unsupported Content-Type '{content_type}'"
                    self._set_page_content(self.page_title)
            else:
                self.page_title = "Error"
                self._set_page_content("Failed to retrieve " + url)
        except requests.exceptions.RequestException as e:
            self.page_title = "Error"
            self._set_page_content(str(e))
```
**Review comment:** Enums here too, or at least some configuration so that users can register new text browsers when they implement their own (for Wikimedia servers, for example). Maybe an even better idea is to pass the text-browser instance to the agent factory itself.
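A minimal sketch of the registration idea, reusing the `BROWSERS` dict from web_surfer.py; `register_browser` and `WikimediaTextBrowser` are hypothetical names, not part of the PR:

```python
from typing import Dict, Type

# Assumes SimpleTextBrowser is importable from the PR's browser_utils package.
from autogen.browser_utils import SimpleTextBrowser

BROWSERS: Dict[str, Type[SimpleTextBrowser]] = {}


def register_browser(name: str, browser_cls: Type[SimpleTextBrowser]) -> None:
    """Let users plug in their own text browsers by name."""
    BROWSERS[name.lower()] = browser_cls


# Hypothetical usage:
# register_browser("wikimedia", WikimediaTextBrowser)
# surfer = WebSurferAgent(..., browser_name="wikimedia")
```

The alternative the reviewer raises, passing a ready-made browser instance into the agent, removes the lookup table entirely and is the most flexible of the three options.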