更新autogptadd

2023-05-30 15:48:14 +08:00
parent cfa885a04e
commit 548532e522
78 changed files with 7706 additions and 0 deletions
--- a/autogpt/commands/web_selenium.py
+++ b/autogpt/commands/web_selenium.py
@@ -0,0 +1,232 @@
+"""Selenium web scraping module."""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from sys import platform
+from typing import TYPE_CHECKING, Optional, Type
+
+from bs4 import BeautifulSoup
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service as ChromeDriverService
+from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.edge.options import Options as EdgeOptions
+from selenium.webdriver.edge.service import Service as EdgeDriverService
+from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.firefox.service import Service as GeckoDriverService
+from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
+from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.safari.options import Options as SafariOptions
+from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager
+from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
+
+from autogpt.commands.command import command
+from autogpt.logs import logger
+from autogpt.memory.vector import MemoryItem, get_memory
+from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
+from autogpt.url_utils.validators import validate_url
+
+if TYPE_CHECKING:
+    from autogpt.config import Config
+
+# Union of the per-browser Selenium options classes; used in this module to
+# annotate the options object built for whichever browser the config selects.
+BrowserOptions = ChromeOptions | EdgeOptions | FirefoxOptions | SafariOptions
+
+# Directory one level above the one containing this module; add_header()
+# reads js/overlay.js relative to it.
+FILE_DIR = Path(__file__).parent.parent
+
+
+@command(
+    "browse_website",
+    "Browse Website",
+    '"url": "<url>", "question": "<what_you_want_to_find_on_website>"',
+)
+@validate_url
+def browse_website(url: str, question: str, config: Config) -> str:
+    """Browse a website and return the answer and links to the user
+
+    Args:
+        url (str): The url of the website to browse
+        question (str): The question asked by the user
+
+    Returns:
+        Tuple[str, WebDriver]: The answer and links to the user and the webdriver
+    """
+    try:
+        driver, text = scrape_text_with_selenium(url, config)
+    except WebDriverException as e:
+        # These errors are often quite long and include lots of context.
+        # Just grab the first line.
+        msg = e.msg.split("\n")[0]
+        return f"Error: {msg}"
+
+    add_header(driver)
+    summary = summarize_memorize_webpage(url, text, question, config, driver)
+    links = scrape_links_with_selenium(driver, url)
+
+    # Limit links to 5
+    if len(links) > 5:
+        links = links[:5]
+    close_browser(driver)
+    return f"Answer gathered from website: {summary}\n\nLinks: {links}"
+
+
+def scrape_text_with_selenium(url: str, config: Config) -> tuple[WebDriver, str]:
+    """Scrape text from a website using selenium
+
+    Args:
+        url (str): The url of the website to scrape
+
+    Returns:
+        Tuple[WebDriver, str]: The webdriver and the text scraped from the website
+    """
+    logging.getLogger("selenium").setLevel(logging.CRITICAL)
+
+    options_available: dict[str, Type[BrowserOptions]] = {
+        "chrome": ChromeOptions,
+        "edge": EdgeOptions,
+        "firefox": FirefoxOptions,
+        "safari": SafariOptions,
+    }
+
+    options: BrowserOptions = options_available[config.selenium_web_browser]()
+    options.add_argument(
+        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
+    )
+
+    if config.selenium_web_browser == "firefox":
+        if config.selenium_headless:
+            options.headless = True
+            options.add_argument("--disable-gpu")
+        driver = FirefoxDriver(
+            service=GeckoDriverService(GeckoDriverManager().install()), options=options
+        )
+    elif config.selenium_web_browser == "edge":
+        driver = EdgeDriver(
+            service=EdgeDriverService(EdgeDriverManager().install()), options=options
+        )
+    elif config.selenium_web_browser == "safari":
+        # Requires a bit more setup on the users end
+        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari
+        driver = SafariDriver(options=options)
+    else:
+        if platform == "linux" or platform == "linux2":
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument("--remote-debugging-port=9222")
+
+        options.add_argument("--no-sandbox")
+        if config.selenium_headless:
+            options.add_argument("--headless=new")
+            options.add_argument("--disable-gpu")
+
+        chromium_driver_path = Path("/usr/bin/chromedriver")
+
+        driver = ChromeDriver(
+            service=ChromeDriverService(str(chromium_driver_path))
+            if chromium_driver_path.exists()
+            else ChromeDriverService(ChromeDriverManager().install()),
+            options=options,
+        )
+    driver.get(url)
+
+    WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.TAG_NAME, "body"))
+    )
+
+    # Get the HTML content directly from the browser's DOM
+    page_source = driver.execute_script("return document.body.outerHTML;")
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return driver, text
+
+
+def scrape_links_with_selenium(driver: WebDriver, url: str) -> list[str]:
+    """Scrape links from a website using selenium
+
+    Args:
+        driver (WebDriver): The webdriver to use to scrape the links
+
+    Returns:
+        List[str]: The links scraped from the website
+    """
+    page_source = driver.page_source
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    hyperlinks = extract_hyperlinks(soup, url)
+
+    return format_hyperlinks(hyperlinks)
+
+
+def close_browser(driver: WebDriver) -> None:
+    """Close the browser
+
+    Args:
+        driver (WebDriver): The webdriver to close
+
+    Returns:
+        None
+    """
+    driver.quit()
+
+
+def add_header(driver: WebDriver) -> None:
+    """Add a header to the website
+
+    Args:
+        driver (WebDriver): The webdriver to use to add the header
+
+    Returns:
+        None
+    """
+    try:
+        with open(f"{FILE_DIR}/js/overlay.js", "r") as overlay_file:
+            overlay_script = overlay_file.read()
+        driver.execute_script(overlay_script)
+    except Exception as e:
+        print(f"Error executing overlay.js: {e}")
+
+
+def summarize_memorize_webpage(
+    url: str,
+    text: str,
+    question: str,
+    config: Config,
+    driver: Optional[WebDriver] = None,
+) -> str:
+    """Summarize text using the OpenAI API
+
+    Args:
+        url (str): The url of the text
+        text (str): The text to summarize
+        question (str): The question to ask the model
+        driver (WebDriver): The webdriver to use to scroll the page
+
+    Returns:
+        str: The summary of the text
+    """
+    if not text:
+        return "Error: No text to summarize"
+
+    text_length = len(text)
+    logger.info(f"Text length: {text_length} characters")
+
+    memory = get_memory(config)
+
+    new_memory = MemoryItem.from_webpage(text, url, question=question)
+    memory.add(new_memory)
+    return new_memory.summary