Add files via upload
This commit is contained in:
committed by
GitHub
parent
a70f08ec2c
commit
2c81e4c8f7
72
loader/data_loader.py
Normal file
72
loader/data_loader.py
Normal file
@ -0,0 +1,72 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
DATAS_CONFIG = "./datas.json"


class INGDataLoader():
    """Load text bodies from the datasets described in a JSON config file.

    The config is a JSON object with two keys:

    * ``"cp_path"``: root directory that every dataset path is joined onto.
    * ``"datasets"``: mapping of dataset name to a settings object holding
      ``"id"``, ``"path"`` (a single JSON file or a directory of JSON files),
      ``"tag"`` (the key read from every record) and, optionally,
      ``"excludes"`` (names inside a directory that must be skipped).
    """

    def __init__(self, config_path: str = DATAS_CONFIG) -> None:
        """Read the config at *config_path* and build the id lookup table.

        Raises:
            OSError: if the config file cannot be opened.
            json.JSONDecodeError: if the config is not valid JSON.
            KeyError: if ``"cp_path"``, ``"datasets"`` or a dataset ``"id"``
                is missing.
        """
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path: str = data["cp_path"]
        self.datasets: dict = data["datasets"]
        # Reverse index: numeric dataset id -> dataset name.
        self.id_table = {
            v["id"]: k for (k, v) in self.datasets.items()
        }

    @staticmethod
    def _read_body(file_path: str, tag: str) -> list:
        """Return the concatenated ``record[tag]`` values of one JSON file."""
        with open(file_path, mode='r', encoding='utf-8') as file:
            data = json.load(file)
        lines = []
        for poem in data:
            # NOTE(review): extend() adds a plain-string value one character
            # at a time; confirm every dataset stores a list under its tag.
            lines.extend(poem[tag])
        return lines

    def body_extractor(self, target: str) -> list:
        """Collect the tagged content of every record in dataset *target*.

        Args:
            target: dataset name, i.e. a key of ``self.datasets``.

        Returns:
            A flat list built by concatenating ``record[tag]`` over all
            records, or ``None`` (after printing a notice) when *target* is
            not a configured dataset.

        Raises:
            OSError: if the dataset path or one of its files is unreadable.
            json.JSONDecodeError: if a file that is read is not valid JSON.
            KeyError: if a record lacks the configured tag.
        """
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return None
        configs = self.datasets[target]
        tag = configs["tag"]
        full_path = os.path.join(self.top_level_path, configs["path"])
        if os.path.isfile(full_path):  # single file json
            return self._read_body(full_path, tag)

        # A directory, probably with a skip list. "excludes" is optional, and
        # directory entries may be written as "name/", which never equals the
        # bare name os.listdir() reports, so trailing separators are dropped.
        excludes = {
            name.rstrip("/\\") for name in configs.get("excludes", ())
        }
        body = []  # may get a bit huge...
        # os.listdir() order is arbitrary; sort so the output is reproducible.
        for filename in sorted(os.listdir(full_path)):
            if filename in excludes:
                continue
            file_path = os.path.join(full_path, filename)
            if not os.path.isfile(file_path):
                # Sub-directories cannot be opened as JSON files.
                continue
            body.extend(self._read_body(file_path, tag))
        return body

    def extract_from_multiple(self, targets: list) -> list:
        """Concatenate the bodies of several datasets, in the given order.

        Unknown dataset names contribute nothing (``body_extractor`` prints a
        notice for each) instead of aborting the whole extraction.
        """
        results = []
        for target in targets:
            # body_extractor() returns None for an unknown dataset.
            results += self.body_extractor(target) or []
        return results

    def extract_with_ids(self, ids: list) -> list:
        """Concatenate the bodies of the datasets with the given numeric ids.

        Raises:
            KeyError: if an id is not present in ``self.id_table``.
        """
        return self.extract_from_multiple(
            [self.id_table[dataset_id] for dataset_id in ids]
        )
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: needs ./datas.json and the dataset files it points to.
    demo = INGDataLoader()

    # Mapping of numeric id -> dataset name.
    print(demo.id_table)

    # Last entry extracted from a directory-based dataset.
    huajianji_body = demo.body_extractor("wudai-huajianji")
    print(huajianji_body[-1])

    # Number of entries when two datasets are combined.
    combined = demo.extract_from_multiple(["wudai-huajianji", "wudai-nantang"])
    print(len(combined))

    # Full output of an id-based extraction.
    by_id = demo.extract_with_ids([0, 1, 2])
    print(by_id)
|
||||
|
||||
94
loader/datas.json
Normal file
94
loader/datas.json
Normal file
@ -0,0 +1,94 @@
|
||||
{
|
||||
"cp_path": "./data/chinese-poetry/",
|
||||
"datasets": {
|
||||
"wudai-huajianji": {
|
||||
"name": "五代-花间集",
|
||||
"id": 0,
|
||||
"path": "五代诗词/huajianji/",
|
||||
"excludes": ["README.md"],
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"wudai-nantang": {
|
||||
"name": "五代-南唐",
|
||||
"id": 1,
|
||||
"path": "五代诗词/nantang/poetrys.json",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"yuanqu": {
|
||||
"name": "元曲",
|
||||
"id": 2,
|
||||
"path": "元曲/yuanqu.json",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"tangsong": {
|
||||
"name": "全唐诗全宋诗",
|
||||
"id": 3,
|
||||
"path": "全唐诗/",
|
||||
"excludes": ["README.md", "表面结构字.json", "error/", "authors.song.json", "authors.tang.json"],
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"mengzi": {
|
||||
"name": "四书五经-孟子",
|
||||
"id": 4,
|
||||
"path": "四书五经/mengzi.json",
|
||||
"tag": "paragraphs",
|
||||
"comments": "四书五经的其他文件不符合两层结构, 以后再想办法处理吧."
|
||||
},
|
||||
"songci": {
|
||||
"name": "宋词",
|
||||
"id": 5,
|
||||
"path": "宋词/",
|
||||
"excludes": ["authors.song.json", "ci.db", "main.py", "README.md", "UpdateCi.py"],
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"youmengying": {
|
||||
"name": "幽梦影-张潮文集",
|
||||
"id": 6,
|
||||
"path": "幽梦影/youmengying.json",
|
||||
"tag": "content"
|
||||
},
|
||||
"yudingquantangshi": {
|
||||
"name": "御定全唐诗",
|
||||
"id": 7,
|
||||
"path": "御定全唐诗/json/",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"caocao": {
|
||||
"name": "曹操诗集",
|
||||
"id": 8,
|
||||
"path": "曹操诗集/caocao.json",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"chuci": {
|
||||
"name": "楚辞",
|
||||
"id": 9,
|
||||
"path": "楚辞/chuci.json",
|
||||
"tag": "content"
|
||||
},
|
||||
"shuimotangshi": {
|
||||
"name": "水墨唐诗",
|
||||
"id": 10,
|
||||
"path": "水墨唐诗/shuimotangshi.json",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"nalanxingde": {
|
||||
"name": "纳兰性德",
|
||||
"id": 11,
|
||||
"path": "纳兰性德/纳兰性德诗集.json",
|
||||
"tag": "para",
|
||||
"comments": "蒙学文件夹下的非常规格式文件没有包括在这个json中"
|
||||
},
|
||||
"lunyu": {
|
||||
"name": "论语",
|
||||
"id": 12,
|
||||
"path": "论语/lunyu.json",
|
||||
"tag": "paragraphs"
|
||||
},
|
||||
"shijing": {
|
||||
"name": "诗经",
|
||||
"id": 13,
|
||||
"path": "诗经/shijing.json",
|
||||
"tag": "content"
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user