feature(read pdf paper then write summary):

Add a function called readPdf to toolbox that reads a PDF paper and returns its text as a list of strings (one per text box); bs4.BeautifulSoup is then used to clean the content.
2023-03-31 00:54:01 +08:00
parent 2d5d719696
commit 5521f0e41c
4 changed files with 80 additions and 11 deletions
--- a/toolbox.py
+++ b/toolbox.py
@ -1,6 +1,14 @@
 import markdown, mdtex2html, threading, importlib, traceback
 from show_math import convert as convert_math
 from functools import wraps
+import pdfminer
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams
+from pdfminer.converter import PDFPageAggregator

 def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
    """
@ -235,4 +243,52 @@ def clear_line_break(txt):
    txt = txt.replace('\n', ' ')
    txt = txt.replace('  ', ' ')
    txt = txt.replace('  ', ' ')
-    return txt
+    return txt
+
+def readPdf(pdfPath):
+    """
+    Read a PDF file and return the text of its horizontal text boxes.
+
+    Args:
+        pdfPath: Path of the PDF file to read.
+
+    Returns:
+        A list of strings, one per horizontal text box, in the order the
+        boxes are encountered page by page.
+
+    Raises:
+        PDFTextExtractionNotAllowed: If the document forbids text extraction.
+    """
+    # Parameters for pdfminer's layout analysis.
+    laparams = LAParams(
+        char_margin=10.0,
+        line_margin=0.2,
+        boxes_flow=0.2,
+        all_texts=False,
+    )
+    outTextList = []
+    # The parser keeps reading from fp while pages are processed, so the
+    # whole loop stays inside the block; the file is closed even on error.
+    with open(pdfPath, 'rb') as fp:
+        # No password is supplied; PDFDocument would take it as its second
+        # argument.
+        document = PDFDocument(PDFParser(fp))
+        if not document.is_extractable:
+            raise PDFTextExtractionNotAllowed
+
+        # The resource manager stores resources shared between pages.
+        rsrcmgr = PDFResourceManager()
+        # The aggregator device collects the layout of each processed page.
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            # Iterate the layout container itself instead of its private
+            # ``_objs`` attribute.
+            outTextList.extend(
+                obj.get_text()
+                for obj in device.get_result()
+                if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal)
+            )
+    return outTextList