import json
import os
from typing import Optional


# Default config location. This is a relative path, so it is resolved against
# the current working directory, not against this module's directory.
DATAS_CONFIG = "./datas.json"


class INGDataLoader():
    """Load text bodies from the datasets described in a JSON config file.

    The config is a JSON object from which two keys are read:

    * ``"cp_path"``: root directory that every dataset path is joined onto.
    * ``"datasets"``: mapping of dataset key -> settings. Each settings
      object supplies ``"id"``, ``"path"`` (a JSON file, or a directory of
      JSON files) and ``"tag"`` (the field taken from every record). A
      directory dataset may also list entry names to skip in ``"excludes"``.
    """

    def __init__(self, config_path: str = DATAS_CONFIG) -> None:
        """Read the config file and build the id -> dataset key table.

        Args:
            config_path: Path of the JSON config file.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the config file is not valid JSON.
            KeyError: If ``"cp_path"``, ``"datasets"`` or a dataset's
                ``"id"`` is missing.
        """
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path: str = data["cp_path"]
        self.datasets: dict = data["datasets"]
        # Reverse lookup used by extract_with_ids(). If two datasets share an
        # id, the one appearing later in the config wins.
        self.id_table = {
            v["id"]: k for (k, v) in self.datasets.items()
        }

    @staticmethod
    def _append_tagged(file_path: str, tag: str, body: list) -> None:
        """Append the ``tag`` field of every record in one JSON file to ``body``.

        A list-valued field contributes its elements; a plain string is
        appended whole, because ``list += str`` would split it into
        single characters.
        """
        with open(file_path, mode='r', encoding='utf-8') as file:
            data = json.load(file)
        for poem in data:
            value = poem[tag]
            if isinstance(value, str):
                body.append(value)
            else:
                body.extend(value)

    def body_extractor(self, target: str) -> Optional[list]:
        """Collect the tagged field of every record in one dataset.

        Args:
            target: Dataset key as it appears under ``"datasets"``.

        Returns:
            A flat list of the extracted values, or ``None`` (after printing
            a notice) when ``target`` is not a configured dataset.

        Raises:
            OSError: If the dataset path does not exist or cannot be read.
            json.JSONDecodeError: If a file that is read is not valid JSON.
            KeyError: If a record lacks the configured tag.
        """
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return None
        configs = self.datasets[target]
        tag = configs["tag"]
        body = []  # may get a bit huge...
        full_path = os.path.join(self.top_level_path, configs["path"])
        if os.path.isfile(full_path):  # single file json
            self._append_tagged(full_path, tag, body)
            return body
        # A directory. "excludes" is optional, and its entries may carry a
        # trailing separator (e.g. "error/") which os.listdir() names never do.
        excludes = {
            name.rstrip("/\\") for name in configs.get("excludes", ())
        }
        # Sorted so the result order does not depend on the filesystem.
        for filename in sorted(os.listdir(full_path)):
            if filename in excludes:
                continue
            file_path = os.path.join(full_path, filename)
            if not os.path.isfile(file_path):
                # Subdirectories cannot be parsed as a JSON file.
                continue
            self._append_tagged(file_path, tag, body)
        return body

    def extract_from_multiple(self, targets: list) -> list:
        """Concatenate the bodies of several datasets, in the order given.

        Unknown dataset keys are reported by body_extractor() and contribute
        nothing to the result.
        """
        results = []
        for target in targets:
            body = self.body_extractor(target)
            if body is not None:
                results += body
        return results

    def extract_with_ids(self, ids: list) -> list:
        """Concatenate the bodies of the datasets with the given ids.

        Raises:
            KeyError: If an id is not present in the config.
        """
        results = []
        for dataset_id in ids:
            results += self.body_extractor(self.id_table[dataset_id])
        return results


if __name__ == "__main__":
    # Manual smoke run against the default config in the working directory.
    loader = INGDataLoader()
    print(loader.id_table)
    print(
        loader.body_extractor("wudai-huajianji")[-1]
    )
    print(
        len(loader.extract_from_multiple(["wudai-huajianji", "wudai-nantang"]))
    )
    print(
        loader.extract_with_ids([0, 1, 2])
    )
) + diff --git a/loader/datas.json b/loader/datas.json new file mode 100644 index 0000000..6a2477f --- /dev/null +++ b/loader/datas.json @@ -0,0 +1,94 @@ +{ + "cp_path": "./data/chinese-poetry/", + "datasets": { + "wudai-huajianji": { + "name": "五代-花间集", + "id": 0, + "path": "五代诗词/huajianji/", + "excludes": ["README.md"], + "tag": "paragraphs" + }, + "wudai-nantang": { + "name": "五代-南唐", + "id": 1, + "path": "五代诗词/nantang/poetrys.json", + "tag": "paragraphs" + }, + "yuanqu": { + "name": "元曲", + "id": 2, + "path": "元曲/yuanqu.json", + "tag": "paragraphs" + }, + "tangsong": { + "name": "全唐诗全宋诗", + "id": 3, + "path": "全唐诗/", + "excludes": ["README.md", "表面结构字.json", "error/", "authors.song.json", "authors.tang.json"], + "tag": "paragraphs" + }, + "mengzi": { + "name": "四书五经-孟子", + "id": 4, + "path": "四书五经/mengzi.json", + "tag": "paragraphs", + "comments": "四书五经的其他文件不符合两层结构, 以后再想办法处理吧." + }, + "songci": { + "name": "宋词", + "id": 5, + "path": "宋词/", + "excludes": ["authors.song.json", "ci.db", "main.py", "README.md", "UpdateCi.py"], + "tag": "paragraphs" + }, + "youmengying": { + "name": "幽梦影-张潮文集", + "id": 6, + "path": "幽梦影/youmengying.json", + "tag": "content" + }, + "yudingquantangshi": { + "name": "御定全唐诗", + "id": 7, + "path": "御定全唐诗/json/", + "tag": "paragraphs" + }, + "caocao": { + "name": "曹操诗集", + "id": 8, + "path": "曹操诗集/caocao.json", + "tag": "paragraphs" + }, + "chuci": { + "name": "楚辞", + "id": 9, + "path": "楚辞/chuci.json", + "tag": "content" + }, + "shuimotangshi": { + "name": "水墨唐诗", + "id": 10, + "path": "水墨唐诗/shuimotangshi.json", + "tag": "paragraphs" + }, + "nalanxingde": { + "name": "纳兰性德", + "id": 11, + "path": "纳兰性德/纳兰性德诗集.json", + "tag": "para", + "comments": "蒙学文件夹下的非常规格式文件没有包括在这个json中" + }, + "lunyu": { + "name": "论语", + "id": 12, + "path": "论语/lunyu.json", + "tag": "paragraphs" + }, + "shijing": { + "name": "诗经", + "id": 13, + "path": "诗经/shijing.json", + "tag": "content" + } + } +} \ No newline at end of file