File size: 1,342 Bytes
780c913 fe19632 419f9af fe19632 419f9af 780c913 fe19632 419f9af fe19632 419f9af fe19632 419f9af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import os
# Optional extra fields for individual source files, for instance the URL a
# document was obtained from. Keys are file paths below "data/"; get_sources()
# merges the matching entry (if any) into the record it builds for that file.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {
        "url": "https://www.with.org/tao_te_ching_en.pdf",
    },
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
    },
}
def get_domains():
    """Return the names of the directories directly under ``data``.

    Only the first directory level is listed, which agrees with
    ``parse_domain``: it takes the second path component of a source path
    as the domain. Names are returned exactly as they appear on disk
    (underscores are not replaced) and are sorted so the result does not
    depend on the order in which the file system lists entries.

    Returns:
        A sorted list of directory names; empty when ``data`` is missing
        or contains no sub-directories.
    """
    # The first tuple os.walk yields describes "data" itself. A missing
    # directory yields nothing at all, so fall back to an empty listing.
    _root, subdirs, _files = next(os.walk("data"), ("data", [], []))
    return sorted(subdirs)
def get_sources():
    """Collect one record per PDF found anywhere under the ``data`` directory.

    Each record is a dict with the keys ``"domain"`` (from ``parse_domain``),
    ``"name"`` (from ``parse_name``) and ``"file_path"``, followed by any
    extra keys registered for that file in ``metadata_by_file_path``. The
    metadata is merged last, so on a key clash the metadata value wins.

    Returns:
        A list of record dicts, ordered by directory and then by file name.
        The list is empty when ``data`` does not exist or holds no PDFs.
    """
    sources = []
    for root, dirs, files in os.walk("data"):
        # Sorting in place makes os.walk descend in a stable order instead
        # of whatever order the file system happens to report.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file_name)
            # Existing progress output, kept so stdout stays the same.
            print("file_path", file_path)
            # Metadata keys are written with forward slashes; normalise the
            # native separator so the lookup also matches on Windows.
            metadata_key = file_path.replace(os.sep, "/")
            sources.append(
                {
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    **metadata_by_file_path.get(metadata_key, {}),
                }
            )
    return sources
def parse_name(source: str) -> str:
    """Turn a file path into a display name.

    The directory part and the final extension are dropped, and every
    underscore in what remains becomes a space.

    Args:
        source: Path of the file.

    Returns:
        The file's base name without its extension, underscores replaced
        by spaces.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    return " ".join(stem.split("_"))
def parse_domain(source: str) -> str:
    """Return the domain of a source file: its second path component.

    For ``data/Daoism/Tao_Te_Ching.pdf`` the result is ``"Daoism"``.
    Underscores in the component are replaced by spaces.

    Args:
        source: Path of the file, with the domain directory as its second
            component.

    Returns:
        The second path component with underscores replaced by spaces.

    Raises:
        IndexError: If ``source`` has fewer than two path components.
    """
    # Where the platform has an alternative separator (Windows accepts "/"
    # next to its native one), treat both alike so forward-slash paths do
    # not collapse into a single component.
    if os.altsep:
        source = source.replace(os.altsep, os.sep)
    return source.split(os.sep)[1].replace("_", " ")
|