Add files via upload

This commit is contained in:
SanBingYouYong-S.Zhang
2023-05-10 17:31:15 +01:00
committed by GitHub
parent a70f08ec2c
commit 2c81e4c8f7
2 changed files with 166 additions and 0 deletions

72
loader/data_loader.py Normal file
View File

@ -0,0 +1,72 @@
import json
import os
# Default config location. It is a relative path, so open() resolves it
# against the current working directory, not against this module's folder.
DATAS_CONFIG = "./datas.json"


class INGDataLoader():
    """Load text bodies from the datasets described in a JSON config file.

    The config must contain ``cp_path`` (root directory of the corpus) and
    ``datasets`` (mapping of dataset name to its settings).  For each dataset
    this class reads ``id``, ``path`` (joined onto ``cp_path``), ``tag`` (the
    key extracted from every record) and, optionally, ``excludes`` (entry
    names to skip when ``path`` is a directory).
    """

    def __init__(self, config_path: str = DATAS_CONFIG) -> None:
        """Read the config at ``config_path`` and build the id lookup table.

        Raises:
            OSError: if the config file cannot be opened.
            KeyError: if ``cp_path``, ``datasets`` or a dataset ``id`` is
                missing from the config.
        """
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path: str = data["cp_path"]
        self.datasets: dict = data["datasets"]
        # Reverse lookup: dataset id -> dataset name.
        self.id_table = {
            settings["id"]: name for (name, settings) in self.datasets.items()
        }

    @staticmethod
    def _load_json(path: str):
        """Return the parsed content of the UTF-8 JSON file at ``path``."""
        with open(path, mode='r', encoding='utf-8') as file:
            return json.load(file)

    @staticmethod
    def _collect(records, tag: str, body: list) -> None:
        """Add the ``tag`` value of every record in ``records`` to ``body``."""
        for record in records:
            value = record[tag]
            if isinstance(value, str):
                # Extending a list with a str would add it one character at
                # a time; keep it as a single item instead.
                # NOTE(review): presumably some datasets store the tag as one
                # string rather than a list of lines -- confirm against data.
                body.append(value)
            else:
                body.extend(value)

    def body_extractor(self, target: str) -> "list | None":
        """Collect the tagged content of every record in dataset ``target``.

        If the dataset path is a single file, that file is read.  Otherwise
        the path is treated as a directory: its entries are visited in sorted
        name order (``os.listdir`` gives no ordering guarantee), skipping
        excluded names and anything that is not a regular file.

        Args:
            target: dataset name, i.e. a key of ``self.datasets``.

        Returns:
            A flat list of the collected items, or ``None`` (after printing
            a message) when ``target`` is not a configured dataset.

        Raises:
            OSError: if the dataset path or one of its files cannot be read.
            KeyError: if a record lacks the configured tag.
        """
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return None
        configs = self.datasets[target]
        tag = configs["tag"]
        body = []  # may get a bit huge...
        full_path = os.path.join(self.top_level_path, configs["path"])

        if os.path.isfile(full_path):  # single file json
            self._collect(self._load_json(full_path), tag, body)
            return body

        # A directory, possibly with a skip list.  "excludes" is optional, and
        # directory-style entries such as "error/" are compared without the
        # trailing separator because os.listdir() returns bare names.
        excluded = {name.rstrip("/\\") for name in configs.get("excludes", [])}
        for filename in sorted(os.listdir(full_path)):
            if filename in excluded:
                continue
            file_path = os.path.join(full_path, filename)
            if not os.path.isfile(file_path):
                # Sub-directories cannot be opened as JSON files.
                continue
            self._collect(self._load_json(file_path), tag, body)
        return body

    def extract_from_multiple(self, targets: list) -> list:
        """Concatenate the bodies of several datasets given by name.

        Names that are not configured are reported by ``body_extractor`` and
        contribute nothing to the result.
        """
        results = []
        for target in targets:
            body = self.body_extractor(target)
            if body is not None:
                results += body
        return results

    def extract_with_ids(self, ids: list) -> list:
        """Concatenate the bodies of several datasets given by numeric id.

        Raises:
            KeyError: if an id is not present in ``self.id_table``.
        """
        results = []
        for dataset_id in ids:
            body = self.body_extractor(self.id_table[dataset_id])
            if body is not None:
                results += body
        return results
if __name__ == "__main__":
    # Manual smoke test: uses the default config path and expects the
    # datasets named below (and ids 0-2) to be present in that config.
    demo = INGDataLoader()
    print(demo.id_table)

    # Last collected item of a single dataset.
    huajianji_body = demo.body_extractor("wudai-huajianji")
    print(huajianji_body[-1])

    # Total number of items across two datasets selected by name.
    wudai_bodies = demo.extract_from_multiple(["wudai-huajianji", "wudai-nantang"])
    print(len(wudai_bodies))

    # Full combined content of the datasets selected by id.
    first_three = demo.extract_with_ids([0, 1, 2])
    print(first_three)

94
loader/datas.json Normal file
View File

@ -0,0 +1,94 @@
{
"cp_path": "./data/chinese-poetry/",
"datasets": {
"wudai-huajianji": {
"name": "五代-花间集",
"id": 0,
"path": "五代诗词/huajianji/",
"excludes": ["README.md"],
"tag": "paragraphs"
},
"wudai-nantang": {
"name": "五代-南唐",
"id": 1,
"path": "五代诗词/nantang/poetrys.json",
"tag": "paragraphs"
},
"yuanqu": {
"name": "元曲",
"id": 2,
"path": "元曲/yuanqu.json",
"tag": "paragraphs"
},
"tangsong": {
"name": "全唐诗全宋诗",
"id": 3,
"path": "全唐诗/",
"excludes": ["README.md", "表面结构字.json", "error/", "authors.song.json", "authors.tang.json"],
"tag": "paragraphs"
},
"mengzi": {
"name": "四书五经-孟子",
"id": 4,
"path": "四书五经/mengzi.json",
"tag": "paragraphs",
"comments": "四书五经的其他文件不符合两层结构, 以后再想办法处理吧."
},
"songci": {
"name": "宋词",
"id": 5,
"path": "宋词/",
"excludes": ["authors.song.json", "ci.db", "main.py", "README.md", "UpdateCi.py"],
"tag": "paragraphs"
},
"youmengying": {
"name": "幽梦影-张潮文集",
"id": 6,
"path": "幽梦影/youmengying.json",
"tag": "content"
},
"yudingquantangshi": {
"name": "御定全唐诗",
"id": 7,
"path": "御定全唐诗/json/",
"tag": "paragraphs"
},
"caocao": {
"name": "曹操诗集",
"id": 8,
"path": "曹操诗集/caocao.json",
"tag": "paragraphs"
},
"chuci": {
"name": "楚辞",
"id": 9,
"path": "楚辞/chuci.json",
"tag": "content"
},
"shuimotangshi": {
"name": "水墨唐诗",
"id": 10,
"path": "水墨唐诗/shuimotangshi.json",
"tag": "paragraphs"
},
"nalanxingde": {
"name": "纳兰性德",
"id": 11,
"path": "纳兰性德/纳兰性德诗集.json",
"tag": "para",
"comments": "蒙学文件夹下的非常规格式文件没有包括在这个json中"
},
"lunyu": {
"name": "论语",
"id": 12,
"path": "论语/lunyu.json",
"tag": "paragraphs"
},
"shijing": {
"name": "诗经",
"id": 13,
"path": "诗经/shijing.json",
"tag": "content"
}
}
}