73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
import json
|
|
import os
|
|
|
|
|
|
# Default location of the JSON file that registers the available datasets.
DATAS_CONFIG = "./datas.json"


class INGDataLoader:
    """Load and flatten text bodies from JSON datasets listed in a config file.

    The config file is a JSON object from which two keys are read:

    * ``"cp_path"``  -- directory that every dataset path is joined onto.
    * ``"datasets"`` -- mapping of dataset name to a settings dict with the
      keys ``"id"``, ``"path"`` and ``"tag"``, plus an optional
      ``"excludes"`` list of file names to skip for directory datasets.

    Attributes:
        top_level_path: value of ``"cp_path"`` from the config.
        datasets: value of ``"datasets"`` from the config.
        id_table: reverse lookup mapping each dataset ``"id"`` to its name.
    """

    def __init__(self, config_path: str = DATAS_CONFIG) -> None:
        """Read the dataset registry from *config_path*.

        Args:
            config_path: path of the JSON config file.

        Raises:
            OSError: if the config file cannot be opened.
            json.JSONDecodeError: if the config file is not valid JSON.
            KeyError: if ``"cp_path"``, ``"datasets"`` or a dataset's
                ``"id"`` is missing.
        """
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path: str = data["cp_path"]
        self.datasets: dict = data["datasets"]
        self.id_table = {
            settings["id"]: name for name, settings in self.datasets.items()
        }

    @staticmethod
    def _read_bodies(json_path: str, tag: str) -> list:
        """Return the concatenated ``tag`` values of every record in one file.

        The file must hold a JSON array of objects; each object's ``tag``
        value is appended element by element to the result.
        """
        with open(json_path, mode='r', encoding='utf-8') as file:
            records = json.load(file)
        body = []
        for record in records:
            body.extend(record[tag])
        return body

    def body_extractor(self, target: str) -> list:
        """Return the flattened body of the dataset named *target*.

        A dataset whose path is a regular file is read directly. Otherwise
        the path is treated as a directory and every entry in it is read,
        except names listed under the dataset's optional ``"excludes"`` key.

        Args:
            target: dataset name, i.e. a key of ``self.datasets``.

        Returns:
            One list with the ``tag`` values of all records. An unknown
            *target* prints a notice and yields an empty list, so the
            result can always be concatenated by the aggregate methods.

        Raises:
            OSError: if the dataset path or one of its files cannot be read.
            KeyError: if the dataset settings lack ``"tag"`` or ``"path"``,
                or a record lacks the tag.
        """
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return []

        configs = self.datasets[target]
        tag = configs["tag"]
        full_path = os.path.join(self.top_level_path, configs["path"])

        if os.path.isfile(full_path):  # single file json
            return self._read_bodies(full_path, tag)

        # A directory; the skip list is optional.
        excludes = configs.get("excludes") or ()
        body = []  # may get a bit huge...
        # os.listdir() returns entries in arbitrary order; sort so the
        # concatenated body is reproducible between runs and platforms.
        for filename in sorted(os.listdir(full_path)):
            if filename in excludes:
                continue
            body.extend(
                self._read_bodies(os.path.join(full_path, filename), tag)
            )
        return body

    def extract_from_multiple(self, targets: list) -> list:
        """Return the bodies of all *targets* concatenated in the given order.

        Unknown dataset names contribute nothing (a notice is printed by
        ``body_extractor``).
        """
        results = []
        for target in targets:
            results.extend(self.body_extractor(target))
        return results

    def extract_with_ids(self, ids: list) -> list:
        """Return the concatenated bodies of the datasets with the given ids.

        Raises:
            KeyError: if an id is not present in ``self.id_table``.
        """
        return self.extract_from_multiple(
            [self.id_table[dataset_id] for dataset_id in ids]
        )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc demo when the module is executed as a script: build a loader
    # from the default config and print a few extractions.
    loader = INGDataLoader()
    print(loader.id_table)

    # Last element of a single dataset's flattened body.
    huajianji_body = loader.body_extractor("wudai-huajianji")
    print(huajianji_body[-1])

    # Total number of elements gathered from two datasets by name.
    combined_body = loader.extract_from_multiple(
        ["wudai-huajianji", "wudai-nantang"]
    )
    print(len(combined_body))

    # Everything gathered from three datasets selected by id.
    bodies_by_id = loader.extract_with_ids([0, 1, 2])
    print(bodies_by_id)
|
|
|