Files
chinese-poetry/loader/data_loader.py
SanBingYouYong-S.Zhang 1fc8091913 Update data_loader.py
2023-05-10 17:33:07 +01:00

73 lines
2.3 KiB
Python

import json
import os
from typing import Optional
# Default location of the dataset config, relative to the working directory.
DATAS_CONFIG = "./loader/datas.json"


class INGDataLoader:
    """Load poem bodies from the datasets described in a JSON config file.

    The config is read once in ``__init__`` and must provide:

    * ``"cp_path"``  -- root directory every dataset ``"path"`` is joined onto.
    * ``"datasets"`` -- mapping of dataset name to an entry holding ``"id"``,
      ``"tag"`` (the key read from each poem), ``"path"`` (a JSON file or a
      directory of JSON files) and, optionally, ``"excludes"`` (file names
      to skip inside a directory dataset).
    """

    def __init__(self, config_path: str = DATAS_CONFIG) -> None:
        """Read the config at *config_path* and build the id -> name table.

        Raises:
            OSError: if the config file cannot be opened.
            json.JSONDecodeError: if the config is not valid JSON.
            KeyError: if ``"cp_path"``, ``"datasets"`` or an entry's ``"id"``
                is missing.
        """
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path: str = data["cp_path"]
        self.datasets: dict = data["datasets"]
        # Reverse lookup used by extract_with_ids: dataset id -> dataset name.
        self.id_table = {
            entry["id"]: name for name, entry in self.datasets.items()
        }

    @staticmethod
    def _read_bodies(json_path: str, tag: str) -> list:
        """Return the concatenation of ``poem[tag]`` for every poem in one file."""
        with open(json_path, mode='r', encoding='utf-8') as file:
            poems = json.load(file)
        lines = []
        for poem in poems:
            # extend (not append): each poem[tag] is itself a sequence of lines.
            lines.extend(poem[tag])
        return lines

    def body_extractor(self, target: str) -> Optional[list]:
        """Collect the tagged body lines of the dataset named *target*.

        A dataset path may be a single JSON file or a directory of JSON
        files. Directory entries are visited in sorted name order so the
        result is reproducible (``os.listdir`` order is arbitrary); names
        listed under ``"excludes"`` and sub-directories are skipped.

        Returns:
            The flat list of body lines, or ``None`` (after printing a
            notice) when *target* is not a configured dataset.
        """
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return None
        configs = self.datasets[target]
        tag = configs["tag"]
        full_path = os.path.join(self.top_level_path, configs["path"])

        if os.path.isfile(full_path):  # single file json
            return self._read_bodies(full_path, tag)

        # A directory; the skip list is optional, so a missing or null
        # "excludes" means "skip nothing" instead of raising.
        excluded = configs.get("excludes") or ()
        body = []  # may get a bit huge...
        for filename in sorted(os.listdir(full_path)):
            if filename in excluded:
                continue
            file_path = os.path.join(full_path, filename)
            if not os.path.isfile(file_path):
                # open() cannot read a nested directory; ignore it.
                continue
            body.extend(self._read_bodies(file_path, tag))
        return body

    def extract_from_multiple(self, targets: list) -> list:
        """Concatenate the bodies of every dataset named in *targets*.

        Unknown dataset names are reported by ``body_extractor`` and then
        skipped, so one bad name no longer aborts the whole extraction.
        """
        results = []
        for target in targets:
            body = self.body_extractor(target)
            if body is not None:
                results += body
        return results

    def extract_with_ids(self, ids: list) -> list:
        """Concatenate the bodies of the datasets whose ids are in *ids*.

        Raises:
            KeyError: if an id is not present in ``id_table``.
        """
        names = [self.id_table[dataset_id] for dataset_id in ids]
        return self.extract_from_multiple(names)
if __name__ == "__main__":
    # Manual smoke run: builds a loader from the default config path and
    # prints a few extraction results for eyeballing.
    demo = INGDataLoader()
    print(demo.id_table)

    # Last body line of a single dataset.
    huajianji_lines = demo.body_extractor("wudai-huajianji")
    print(huajianji_lines[-1])

    # Total number of lines across two datasets selected by name.
    combined = demo.extract_from_multiple(["wudai-huajianji", "wudai-nantang"])
    print(len(combined))

    # Everything from the datasets with ids 0, 1 and 2.
    by_id = demo.extract_with_ids([0, 1, 2])
    print(by_id)