"""Text processing functions""" from math import ceil from typing import Optional import spacy import tiktoken from autogpt.config import Config from autogpt.llm.base import ChatSequence from autogpt.llm.providers.openai import OPEN_AI_MODELS from autogpt.llm.utils import count_string_tokens, create_chat_completion from autogpt.logs import logger from autogpt.utils import batch CFG = Config() def _max_chunk_length(model: str, max: Optional[int] = None) -> int: model_max_input_tokens = OPEN_AI_MODELS[model].max_tokens - 1 if max is not None and max > 0: return min(max, model_max_input_tokens) return model_max_input_tokens def must_chunk_content( text: str, for_model: str, max_chunk_length: Optional[int] = None ) -> bool: return count_string_tokens(text, for_model) > _max_chunk_length( for_model, max_chunk_length ) def chunk_content( content: str, for_model: str, max_chunk_length: Optional[int] = None, with_overlap=True, ): """Split content into chunks of approximately equal token length.""" MAX_OVERLAP = 200 # limit overlap to save tokens if not must_chunk_content(content, for_model, max_chunk_length): yield content, count_string_tokens(content, for_model) return max_chunk_length = max_chunk_length or _max_chunk_length(for_model) tokenizer = tiktoken.encoding_for_model(for_model) tokenized_text = tokenizer.encode(content) total_length = len(tokenized_text) n_chunks = ceil(total_length / max_chunk_length) chunk_length = ceil(total_length / n_chunks) overlap = min(max_chunk_length - chunk_length, MAX_OVERLAP) if with_overlap else 0 for token_batch in batch(tokenized_text, chunk_length + overlap, overlap): yield tokenizer.decode(token_batch), len(token_batch) def summarize_text( text: str, instruction: Optional[str] = None, question: Optional[str] = None ) -> tuple[str, None | list[tuple[str, str]]]: """Summarize text using the OpenAI API Args: text (str): The text to summarize instruction (str): Additional instruction for summarization, e.g. "focus on information related to polar bears", "omit personal information contained in the text" Returns: str: The summary of the text list[(summary, chunk)]: Text chunks and their summary, if the text was chunked. None otherwise. """ if not text: raise ValueError("No text to summarize") if instruction and question: raise ValueError("Parameters 'question' and 'instructions' cannot both be set") model = CFG.fast_llm_model if question: instruction = ( f'include any information that can be used to answer the question "{question}". ' "Do not directly answer the question itself" ) summarization_prompt = ChatSequence.for_model(model) token_length = count_string_tokens(text, model) logger.info(f"Text length: {token_length} tokens") # reserve 50 tokens for summary prompt, 500 for the response max_chunk_length = _max_chunk_length(model) - 550 logger.info(f"Max chunk length: {max_chunk_length} tokens") if not must_chunk_content(text, model, max_chunk_length): # summarization_prompt.add("user", text) summarization_prompt.add( "user", "Write a concise summary of the following text" f"{f'; {instruction}' if instruction is not None else ''}:" "\n\n\n" f'LITERAL TEXT: """{text}"""' "\n\n\n" "CONCISE SUMMARY: The text is best summarized as" # "Only respond with a concise summary or description of the user message." 
) logger.debug(f"Summarizing with {model}:\n{summarization_prompt.dump()}\n") summary = create_chat_completion( summarization_prompt, temperature=0, max_tokens=500 ) logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n") return summary.strip(), None summaries: list[str] = [] chunks = list(split_text(text, for_model=model, max_chunk_length=max_chunk_length)) for i, (chunk, chunk_length) in enumerate(chunks): logger.info( f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens" ) summary, _ = summarize_text(chunk, instruction) summaries.append(summary) logger.info(f"Summarized {len(chunks)} chunks") summary, _ = summarize_text("\n\n".join(summaries)) return summary.strip(), [ (summaries[i], chunks[i][0]) for i in range(0, len(chunks)) ] def split_text( text: str, for_model: str = CFG.fast_llm_model, with_overlap=True, max_chunk_length: Optional[int] = None, ): """Split text into chunks of sentences, with each chunk not exceeding the maximum length Args: text (str): The text to split for_model (str): The model to chunk for; determines tokenizer and constraints max_length (int, optional): The maximum length of each chunk Yields: str: The next chunk of text Raises: ValueError: when a sentence is longer than the maximum length """ max_length = _max_chunk_length(for_model, max_chunk_length) # flatten paragraphs to improve performance text = text.replace("\n", " ") text_length = count_string_tokens(text, for_model) if text_length < max_length: yield text, text_length return n_chunks = ceil(text_length / max_length) target_chunk_length = ceil(text_length / n_chunks) nlp: spacy.language.Language = spacy.load(CFG.browse_spacy_language_model) nlp.add_pipe("sentencizer") doc = nlp(text) sentences = [sentence.text.strip() for sentence in doc.sents] current_chunk: list[str] = [] current_chunk_length = 0 last_sentence = None last_sentence_length = 0 i = 0 while i < len(sentences): sentence = sentences[i] sentence_length = count_string_tokens(sentence, for_model) expected_chunk_length = current_chunk_length + 1 + sentence_length if ( expected_chunk_length < max_length # try to create chunks of approximately equal size and expected_chunk_length - (sentence_length / 2) < target_chunk_length ): current_chunk.append(sentence) current_chunk_length = expected_chunk_length elif sentence_length < max_length: if last_sentence: yield " ".join(current_chunk), current_chunk_length current_chunk = [] current_chunk_length = 0 if with_overlap: overlap_max_length = max_length - sentence_length - 1 if last_sentence_length < overlap_max_length: current_chunk += [last_sentence] current_chunk_length += last_sentence_length + 1 elif overlap_max_length > 5: # add as much from the end of the last sentence as fits current_chunk += [ list( chunk_content( last_sentence, for_model, overlap_max_length, ) ).pop()[0], ] current_chunk_length += overlap_max_length + 1 current_chunk += [sentence] current_chunk_length += sentence_length else: # sentence longer than maximum length -> chop up and try again sentences[i : i + 1] = [ chunk for chunk, _ in chunk_content(sentence, for_model, target_chunk_length) ] continue i += 1 last_sentence = sentence last_sentence_length = sentence_length if current_chunk: yield " ".join(current_chunk), current_chunk_length