Revert autogpt; continue development on a new branch

This commit is contained in:
w_xiaolizu
2023-05-30 16:38:01 +08:00
parent 548532e522
commit 519df19f43
65 changed files with 3352 additions and 1868 deletions


@@ -1,234 +1,174 @@
"""Text processing functions"""
from math import ceil
from typing import Optional
from typing import Dict, Generator, Optional
import spacy
import tiktoken
from selenium.webdriver.remote.webdriver import WebDriver
from autogpt import token_counter
from autogpt.config import Config
from autogpt.llm.base import ChatSequence
from autogpt.llm.providers.openai import OPEN_AI_MODELS
from autogpt.llm.utils import count_string_tokens, create_chat_completion
from autogpt.logs import logger
from autogpt.utils import batch
from autogpt.llm_utils import create_chat_completion
from autogpt.memory import get_memory
CFG = Config()
-def _max_chunk_length(model: str, max: Optional[int] = None) -> int:
-    model_max_input_tokens = OPEN_AI_MODELS[model].max_tokens - 1
-    if max is not None and max > 0:
-        return min(max, model_max_input_tokens)
-    return model_max_input_tokens
-
-def must_chunk_content(
-    text: str, for_model: str, max_chunk_length: Optional[int] = None
-) -> bool:
-    return count_string_tokens(text, for_model) > _max_chunk_length(
-        for_model, max_chunk_length
-    )
-
-def chunk_content(
-    content: str,
-    for_model: str,
-    max_chunk_length: Optional[int] = None,
-    with_overlap=True,
-):
-    """Split content into chunks of approximately equal token length."""
-    MAX_OVERLAP = 200  # limit overlap to save tokens
-    if not must_chunk_content(content, for_model, max_chunk_length):
-        yield content, count_string_tokens(content, for_model)
-        return
-    max_chunk_length = max_chunk_length or _max_chunk_length(for_model)
-    tokenizer = tiktoken.encoding_for_model(for_model)
-    tokenized_text = tokenizer.encode(content)
-    total_length = len(tokenized_text)
-    n_chunks = ceil(total_length / max_chunk_length)
-    chunk_length = ceil(total_length / n_chunks)
-    overlap = min(max_chunk_length - chunk_length, MAX_OVERLAP) if with_overlap else 0
-    for token_batch in batch(tokenized_text, chunk_length + overlap, overlap):
-        yield tokenizer.decode(token_batch), len(token_batch)
-
-def summarize_text(
-    text: str, instruction: Optional[str] = None, question: Optional[str] = None
-) -> tuple[str, None | list[tuple[str, str]]]:
-    """Summarize text using the OpenAI API
-
-    Args:
-        text (str): The text to summarize
-        instruction (str): Additional instruction for summarization, e.g. "focus on information related to polar bears", "omit personal information contained in the text"
-
-    Returns:
-        str: The summary of the text
-        list[(summary, chunk)]: Text chunks and their summary, if the text was chunked.
-            None otherwise.
-    """
-    if not text:
-        raise ValueError("No text to summarize")
-    if instruction and question:
-        raise ValueError("Parameters 'question' and 'instructions' cannot both be set")
-
-    model = CFG.fast_llm_model
-
-    if question:
-        instruction = (
-            f'include any information that can be used to answer the question "{question}". '
-            "Do not directly answer the question itself"
-        )
-
-    summarization_prompt = ChatSequence.for_model(model)
-
-    token_length = count_string_tokens(text, model)
-    logger.info(f"Text length: {token_length} tokens")
-
-    # reserve 50 tokens for summary prompt, 500 for the response
-    max_chunk_length = _max_chunk_length(model) - 550
-    logger.info(f"Max chunk length: {max_chunk_length} tokens")
-
-    if not must_chunk_content(text, model, max_chunk_length):
-        # summarization_prompt.add("user", text)
-        summarization_prompt.add(
-            "user",
-            "Write a concise summary of the following text"
-            f"{f'; {instruction}' if instruction is not None else ''}:"
-            "\n\n\n"
-            f'LITERAL TEXT: """{text}"""'
-            "\n\n\n"
-            "CONCISE SUMMARY: The text is best summarized as"
-            # "Only respond with a concise summary or description of the user message."
-        )
-        logger.debug(f"Summarizing with {model}:\n{summarization_prompt.dump()}\n")
-        summary = create_chat_completion(
-            summarization_prompt, temperature=0, max_tokens=500
-        )
-        logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
-        return summary.strip(), None
-
-    summaries: list[str] = []
-    chunks = list(split_text(text, for_model=model, max_chunk_length=max_chunk_length))
-
-    for i, (chunk, chunk_length) in enumerate(chunks):
-        logger.info(
-            f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens"
-        )
-        summary, _ = summarize_text(chunk, instruction)
-        summaries.append(summary)
-
-    logger.info(f"Summarized {len(chunks)} chunks")
-
-    summary, _ = summarize_text("\n\n".join(summaries))
-    return summary.strip(), [
-        (summaries[i], chunks[i][0]) for i in range(0, len(chunks))
-    ]

 def split_text(
     text: str,
-    for_model: str = CFG.fast_llm_model,
-    with_overlap=True,
-    max_chunk_length: Optional[int] = None,
-):
-    """Split text into chunks of sentences, with each chunk not exceeding the maximum length
+    max_length: int = CFG.browse_chunk_max_length,
+    model: str = CFG.fast_llm_model,
+    question: str = "",
+) -> Generator[str, None, None]:
+    """Split text into chunks of a maximum length

     Args:
         text (str): The text to split
-        for_model (str): The model to chunk for; determines tokenizer and constraints
-        max_length (int, optional): The maximum length of each chunk
+        max_length (int, optional): The maximum length of each chunk. Defaults to 8192.

     Yields:
         str: The next chunk of text

     Raises:
-        ValueError: when a sentence is longer than the maximum length
+        ValueError: If the text is longer than the maximum length
     """
-    max_length = _max_chunk_length(for_model, max_chunk_length)
-    # flatten paragraphs to improve performance
-    text = text.replace("\n", " ")
-    text_length = count_string_tokens(text, for_model)
-    if text_length < max_length:
-        yield text, text_length
-        return
-    n_chunks = ceil(text_length / max_length)
-    target_chunk_length = ceil(text_length / n_chunks)
-    nlp: spacy.language.Language = spacy.load(CFG.browse_spacy_language_model)
+    flatened_paragraphs = " ".join(text.split("\n"))
+    nlp = spacy.load(CFG.browse_spacy_language_model)
     nlp.add_pipe("sentencizer")
-    doc = nlp(text)
-    sentences = [sentence.text.strip() for sentence in doc.sents]
+    doc = nlp(flatened_paragraphs)
+    sentences = [sent.text.strip() for sent in doc.sents]
-    current_chunk: list[str] = []
-    current_chunk_length = 0
-    last_sentence = None
-    last_sentence_length = 0
+    current_chunk = []
-    i = 0
-    while i < len(sentences):
-        sentence = sentences[i]
-        sentence_length = count_string_tokens(sentence, for_model)
-        expected_chunk_length = current_chunk_length + 1 + sentence_length
+    for sentence in sentences:
+        message_with_additional_sentence = [
+            create_message(" ".join(current_chunk) + " " + sentence, question)
+        ]
-        if (
-            expected_chunk_length < max_length
-            # try to create chunks of approximately equal size
-            and expected_chunk_length - (sentence_length / 2) < target_chunk_length
-        ):
+        expected_token_usage = (
+            token_usage_of_chunk(messages=message_with_additional_sentence, model=model)
+            + 1
+        )
+        if expected_token_usage <= max_length:
             current_chunk.append(sentence)
-            current_chunk_length = expected_chunk_length
-        elif sentence_length < max_length:
-            if last_sentence:
-                yield " ".join(current_chunk), current_chunk_length
-                current_chunk = []
-                current_chunk_length = 0
-                if with_overlap:
-                    overlap_max_length = max_length - sentence_length - 1
-                    if last_sentence_length < overlap_max_length:
-                        current_chunk += [last_sentence]
-                        current_chunk_length += last_sentence_length + 1
-                    elif overlap_max_length > 5:
-                        # add as much from the end of the last sentence as fits
-                        current_chunk += [
-                            list(
-                                chunk_content(
-                                    last_sentence,
-                                    for_model,
-                                    overlap_max_length,
-                                )
-                            ).pop()[0],
-                        ]
-                        current_chunk_length += overlap_max_length + 1
-            current_chunk += [sentence]
-            current_chunk_length += sentence_length
-        else:  # sentence longer than maximum length -> chop up and try again
-            sentences[i : i + 1] = [
-                chunk
-                for chunk, _ in chunk_content(sentence, for_model, target_chunk_length)
+        else:
+            yield " ".join(current_chunk)
+            current_chunk = [sentence]
+            message_this_sentence_only = [
+                create_message(" ".join(current_chunk), question)
             ]
-            continue
-        i += 1
-        last_sentence = sentence
-        last_sentence_length = sentence_length
+            expected_token_usage = (
+                token_usage_of_chunk(messages=message_this_sentence_only, model=model)
+                + 1
+            )
+            if expected_token_usage > max_length:
+                raise ValueError(
+                    f"Sentence is too long in webpage: {expected_token_usage} tokens."
+                )

     if current_chunk:
-        yield " ".join(current_chunk), current_chunk_length
+        yield " ".join(current_chunk)

+def token_usage_of_chunk(messages, model):
+    return token_counter.count_message_tokens(messages, model)
+
+def summarize_text(
+    url: str, text: str, question: str, driver: Optional[WebDriver] = None
+) -> str:
+    """Summarize text using the OpenAI API
+
+    Args:
+        url (str): The url of the text
+        text (str): The text to summarize
+        question (str): The question to ask the model
+        driver (WebDriver): The webdriver to use to scroll the page
+
+    Returns:
+        str: The summary of the text
+    """
+    if not text:
+        return "Error: No text to summarize"
+
+    model = CFG.fast_llm_model
+    text_length = len(text)
+    print(f"Text length: {text_length} characters")
+
+    summaries = []
+    chunks = list(
+        split_text(
+            text, max_length=CFG.browse_chunk_max_length, model=model, question=question
+        ),
+    )
+    scroll_ratio = 1 / len(chunks)
+
+    for i, chunk in enumerate(chunks):
+        if driver:
+            scroll_to_percentage(driver, scroll_ratio * i)
+        print(f"Adding chunk {i + 1} / {len(chunks)} to memory")
+
+        memory_to_add = f"Source: {url}\n" f"Raw content part#{i + 1}: {chunk}"
+
+        memory = get_memory(CFG)
+        memory.add(memory_to_add)
+
+        messages = [create_message(chunk, question)]
+        tokens_for_chunk = token_counter.count_message_tokens(messages, model)
+        print(
+            f"Summarizing chunk {i + 1} / {len(chunks)} of length {len(chunk)} characters, or {tokens_for_chunk} tokens"
+        )
+
+        summary = create_chat_completion(
+            model=model,
+            messages=messages,
+        )
+        summaries.append(summary)
+        print(
+            f"Added chunk {i + 1} summary to memory, of length {len(summary)} characters"
+        )
+
+        memory_to_add = f"Source: {url}\n" f"Content summary part#{i + 1}: {summary}"
+        memory.add(memory_to_add)
+
+    print(f"Summarized {len(chunks)} chunks.")
+
+    combined_summary = "\n".join(summaries)
+    messages = [create_message(combined_summary, question)]
+
+    return create_chat_completion(
+        model=model,
+        messages=messages,
+    )
+
+def scroll_to_percentage(driver: WebDriver, ratio: float) -> None:
+    """Scroll to a percentage of the page
+
+    Args:
+        driver (WebDriver): The webdriver to use
+        ratio (float): The percentage to scroll to
+
+    Raises:
+        ValueError: If the ratio is not between 0 and 1
+    """
+    if ratio < 0 or ratio > 1:
+        raise ValueError("Percentage should be between 0 and 1")
+    driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {ratio});")
+
+def create_message(chunk: str, question: str) -> Dict[str, str]:
+    """Create a message for the chat completion
+
+    Args:
+        chunk (str): The chunk of text to summarize
+        question (str): The question to answer
+
+    Returns:
+        Dict[str, str]: The message to send to the chat completion
+    """
+    return {
+        "role": "user",
+        "content": f'"""{chunk}""" Using the above text, answer the following'
+        f' question: "{question}" -- if the question cannot be answered using the text,'
+        " summarize the text.",
+    }
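
For orientation, a minimal usage sketch of the restored (`+`) helpers. It is not part of the commit: the URL, page text, and question are made-up values, and the `autogpt.processing.text` import path is an assumption taken from upstream Auto-GPT.

# Hypothetical usage sketch; assumes a configured Auto-GPT environment and the
# post-revert signatures of split_text, create_message, and summarize_text.
from autogpt.processing.text import create_message, split_text, summarize_text

url = "https://example.com/pricing"              # made-up source URL
page_text = "Plans start at $10 per month. ..."  # made-up scraped page text
question = "What pricing tiers are mentioned?"

# split_text yields sentence-based chunks whose token usage stays within max_length
for chunk in split_text(page_text, question=question):
    print(create_message(chunk, question)["content"][:80])

# summarize_text stores raw chunks and per-chunk summaries in memory,
# then returns a combined summary/answer for the question
print(summarize_text(url, page_text, question))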