From fbabefcd0636e19122acc141d8a1c6c0d388d476 Mon Sep 17 00:00:00 2001
From: chienmy
Date: Sat, 30 Apr 2022 16:53:25 +0800
Subject: [PATCH] Add: correction script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ci/UpdateCi.py | 106 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 ci/UpdateCi.py

diff --git a/ci/UpdateCi.py b/ci/UpdateCi.py
new file mode 100644
index 0000000..7dc4be3
--- /dev/null
+++ b/ci/UpdateCi.py
@@ -0,0 +1,106 @@
+import json
+import logging
+import os
+import re
+from difflib import SequenceMatcher
+
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+def get_page_content(page: int) -> list:
+    """ Fetch the entries listed on one page of the catalog """
+    content = []
+    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
+        "seektype": 2,
+        "seekvalue": "",
+        "pageno": page
+    })
+    r.encoding = "gbk"
+    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
+    for i, a in enumerate(soup.find_all(name="a")):
+        # Links alternate: even index is the title, odd index is the author
+        if i % 2 == 0:
+            content.append({
+                "rhythmic": a.string.split("(")[0],
+                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
+            })
+        else:
+            content[-1]["author"] = a.string
+    for c in content:
+        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
+        del c["param"]
+    return content
+
+
+def get_paragraphs(seek_type: int, seek_value: int) -> list:
+    """ Fetch the body paragraphs of one ci poem """
+    paragraphs = []
+    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
+        "seektype": seek_type,
+        "seekvalue": seek_value,
+        "pageno": 1
+    })
+    r.encoding = "gbk"
+    soup = BeautifulSoup(re.search(r"fillbody\('(.*?)'\);", r.text).group(1), features="lxml")
+    for child in soup.find(name="p", align=None).contents:
+        if isinstance(child, NavigableString):
+            paragraphs.append(str(child))
+    return paragraphs
+
+
+def get_all_page(temp_file: str):
+    """ Crawl every catalog page and save the data to a temporary file """
+    all_data = []
+    for page in range(1, 1240):
+        all_data.extend(get_page_content(page))
+        logging.info("Success: save page {0}".format(page))
+        # Rewrite the temporary file after each page so progress is not lost
+        with open(temp_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
+
+
+def only_text(text: str):
+    """ Strip punctuation and whitespace, keeping only the characters """
+    return re.sub(r"[,。、《》…()·・\s]", "", text)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
+    temp_file_name = "all.json"
+    # Crawl first if the temporary file does not exist yet
+    if not os.path.exists(temp_file_name):
+        get_all_page(temp_file_name)
+    # Read the crawled data back from the temporary file
+    with open(temp_file_name, "r", encoding="utf-8") as f:
+        all_data = json.load(f)
+    # Count how many items get corrected
+    diff_num = 0
+    # Walk through the current directory
+    for file_name in os.listdir("./"):
+        if re.match(r"ci\.song\.\d+\.json", file_name):
+            # Index into all_data at which this file starts
+            start = int(file_name.split(".")[2])
+            with open(file_name, "r", encoding="utf-8") as f:
+                file_data = json.load(f)
+            for i in range(len(file_data)):
+                old_text = only_text("".join(file_data[i]["paragraphs"]))
+                new_text = only_text("".join(all_data[start + i]["paragraphs"]))
+                # Similarity of the punctuation-free texts
+                ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
+                if 0.9 <= ratio < 1.0:
+                    # Assume this range indicates missing characters; update the item
+                    diff_num += 1
+                    file_data[i]["author"] = all_data[start + i]["author"]
+                    file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
+                elif ratio < 0.9:
+                    # Abnormal case: log a warning and leave the item unchanged
+                    logging.warning(old_text)
+                    logging.warning(new_text)
+            # Save the data; the original files have a space after each comma, keep that
+            with open(file_name, "w", encoding="utf-8") as f:
+                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
+            logging.info("Save " + file_name)
+    logging.info("Change {0} items".format(diff_num))
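
As a rough illustration of the 0.9 cutoff used above: quick_ratio() is an
upper bound on similarity computed from character counts, so a paragraph
missing a single character scores just below 1.0, while an unrelated
paragraph falls far under 0.9. The strings below are hypothetical examples,
not data from the corpus:

    from difflib import SequenceMatcher

    full = "明月几时有把酒问青天"            # punctuation already stripped
    missing_one = "明月几时有把酒问青"        # one character dropped
    unrelated = "大江东去浪淘尽千古风流人物"

    # 2 * 9 / (10 + 9) ~= 0.947 -> in [0.9, 1.0), so the item would be updated
    print(SequenceMatcher(a=full, b=missing_one).quick_ratio())
    # No characters in common -> 0.0, so the item would only be logged as a warning
    print(SequenceMatcher(a=full, b=unrelated).quick_ratio())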