Carlos Salgado committed on
Commit
793ea5f
1 Parent(s): cc9e69a

fix plaintext doc not being ingested

Browse files
Files changed (1) hide show
  1. backend/generate_metadata.py +32 -10
backend/generate_metadata.py CHANGED
@@ -8,6 +8,8 @@ from dotenv import load_dotenv
8
  from langchain_community.document_loaders import TextLoader
9
  from langchain_community.document_loaders import UnstructuredPDFLoader
10
  from langchain_community.embeddings.fake import FakeEmbeddings
 
 
11
  from langchain_community.vectorstores import Vectara
12
 
13
  from schema import Metadata, BimDiscipline
@@ -24,27 +26,46 @@ vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
24
 
25
 
26
  def ingest(file_path):
27
- extension = filepath.split('.')[-1]
28
  ext = extension.lower()
29
  if ext == 'pdf':
30
  loader = UnstructuredPDFLoader(file_path)
31
  elif ext == 'txt':
32
  loader = TextLoader(file_path)
 
 
33
 
34
  # transform locally
35
  documents = loader.load()
36
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
 
 
 
 
 
 
 
 
 
 
 
 
37
  docs = text_splitter.split_documents(documents)
 
 
 
 
38
 
39
- vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
40
- retriever = vectara.as_retriever()
41
 
42
- return retriever
43
 
44
 
45
- def extract_metadata(filename):
46
- with open(filename, 'r') as f:
47
- context = f.readlines()
 
48
 
49
  # Create client
50
  client = openai.OpenAI(
@@ -63,7 +84,7 @@ def extract_metadata(filename):
63
  },
64
  {
65
  "role": "user",
66
- "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{' '.join(context)}"
67
  }
68
  ]
69
  )
@@ -82,5 +103,6 @@ if __name__ == "__main__":
82
  print("File '{}' not found or not accessible.".format(args.document))
83
  sys.exit(-1)
84
 
85
- metadata = extract_metadata(args.document)
 
86
  print(json.dumps(metadata, indent=2))
 
8
  from langchain_community.document_loaders import TextLoader
9
  from langchain_community.document_loaders import UnstructuredPDFLoader
10
  from langchain_community.embeddings.fake import FakeEmbeddings
11
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
12
+
13
  from langchain_community.vectorstores import Vectara
14
 
15
  from schema import Metadata, BimDiscipline
 
26
 
27
 
28
  def ingest(file_path):
29
+ extension = file_path.split('.')[-1]
30
  ext = extension.lower()
31
  if ext == 'pdf':
32
  loader = UnstructuredPDFLoader(file_path)
33
  elif ext == 'txt':
34
  loader = TextLoader(file_path)
35
+ else:
36
+ raise NotImplementedError('Only .txt or .pdf files are supported')
37
 
38
  # transform locally
39
  documents = loader.load()
40
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
41
+ separators=[
42
+ "\n\n",
43
+ "\n",
44
+ " ",
45
+ ",",
46
+ "\uff0c", # Fullwidth comma
47
+ "\u3001", # Ideographic comma
48
+ "\uff0e", # Fullwidth full stop
49
+ # "\u200B", # Zero-width space (Asian languages)
50
+ # "\u3002", # Ideographic full stop (Asian languages)
51
+ "",
52
+ ])
53
  docs = text_splitter.split_documents(documents)
54
+ #print(docs)
55
+
56
+ return docs
57
+
58
 
59
+ # vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
60
+ # retriever = vectara.as_retriever()
61
 
62
+ # return retriever
63
 
64
 
65
+ def extract_metadata(docs):
66
+ # plain text
67
+ context = "".join(
68
+ [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
69
 
70
  # Create client
71
  client = openai.OpenAI(
 
84
  },
85
  {
86
  "role": "user",
87
+ "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
88
  }
89
  ]
90
  )
 
103
  print("File '{}' not found or not accessible.".format(args.document))
104
  sys.exit(-1)
105
 
106
+ docs = ingest(args.document)
107
+ metadata = extract_metadata(docs)
108
  print(json.dumps(metadata, indent=2))