import os
from typing import Dict, List

# Extra metadata merged into the entry that get_sources() builds for a file,
# for instance the URL the document was obtained from. Keys are file paths
# starting at the "data" directory and are always written with forward
# slashes, regardless of the platform's path separator.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {"url": "https://www.with.org/tao_te_ching_en.pdf"},
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf"
    },
}


def get_domains() -> List[str]:
    """Return the names of the directories located directly under ``data``.

    Only the immediate children of ``data`` are reported, which matches what
    ``parse_domain`` extracts from a file path; directories nested deeper are
    not domains. Names are returned exactly as they appear on disk (underscores
    are NOT replaced by spaces) and in sorted order.

    Returns:
        The sorted directory names, or an empty list when ``data`` does not
        exist or has no sub-directories.
    """
    # The first entry produced by os.walk() describes "data" itself, so its
    # directory list is exactly the set of top-level domains. The default
    # covers the case where "data" is missing and os.walk() yields nothing.
    _, top_level_dirs, _ = next(os.walk("data"), ("data", [], []))
    return sorted(top_level_dirs)


def get_sources() -> List[Dict[str, str]]:
    """Collect one description per PDF file found anywhere under ``data``.

    Each description is a dict with the keys ``domain`` (see ``parse_domain``),
    ``name`` (see ``parse_name``) and ``file_path``, plus any extra keys
    registered for that file in ``metadata_by_file_path``. The path of every
    PDF found is also printed to standard output.

    Returns:
        The descriptions in a deterministic order (directories and files are
        visited alphabetically); an empty list when no PDF is found.
    """
    sources = []
    for root, dirs, files in os.walk("data"):
        # Sorting in place makes os.walk() descend in a reproducible order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file_name)
            print("file_path", file_path)
            # The metadata keys use forward slashes; normalise the separator
            # so the lookup also succeeds where os.sep is a backslash.
            metadata_key = file_path.replace(os.sep, "/")
            sources.append(
                {
                    # NOTE(review): for a PDF stored directly in "data" (no
                    # domain directory) this yields the file name itself.
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    **metadata_by_file_path.get(metadata_key, {}),
                }
            )
    return sources


def parse_name(source: str) -> str:
    """Return a display name for *source*.

    The name is the final path component without its extension, with every
    underscore replaced by a space.
    """
    filename = os.path.basename(source)
    name, _ = os.path.splitext(filename)
    return name.replace("_", " ")


def parse_domain(source: str) -> str:
    """Return the domain of *source*: its second path component.

    The path is normalised first, so a leading ``./``, doubled separators or
    (on Windows) forward slashes do not shift the component that is picked.
    Underscores in the component are replaced by spaces.

    Raises:
        IndexError: if the normalised path has fewer than two components.
    """
    parts = os.path.normpath(source).split(os.sep)
    return parts[1].replace("_", " ")