dinhquangson committed on
Commit
acf945a
·
verified ·
1 Parent(s): 19e7110

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -70
app.py CHANGED
@@ -4,6 +4,7 @@ from datasets import load_dataset
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import pdfplumber
6
  import pytesseract
 
7
 
8
  # Loading
9
  import os
@@ -272,6 +273,8 @@ async def download_database():
272
  async def convert_upload_file(file: UploadFile = File(...)):
273
  import pytesseract
274
  from pdf2image import convert_from_path
 
 
275
 
276
  file_savePath = join(temp_path,file.filename)
277
 
@@ -287,78 +290,26 @@ async def convert_upload_file(file: UploadFile = File(...)):
287
  ocr_text = pytesseract.image_to_string(image,lang='vie')
288
  text=text+ocr_text+'\n'
289
 
290
- return text
291
-
292
- def get_type_name(element):
293
- return type(element).__name__
294
-
295
- def filter_by_type(elements, type):
296
- return [element for element in elements if get_type_name(element) == type]
297
-
298
- import re
299
-
300
- def extract_value_from_text(text, format):
301
- pattern = re.compile(format)
302
- match = pattern.search(text)
303
- if match:
304
- return match.group(0) # Use group(0) to get the entire match
305
- else:
306
- return None
307
-
308
- def filter_by_labels(elements, labels, format):
309
- for element in elements:
310
- for label in labels:
311
- if label.lower() in element.text.lower():
312
- return extract_value_from_text(element.text, format)
313
- return None
314
-
315
- def filter_by_values(elements, values):
316
- for element in elements:
317
- for value in values:
318
- if value.lower() in element.text.lower():
319
- return value
320
- return None
321
-
322
- def get_elements_by_schemas(elements, schemas):
323
- result_elements=[]
324
- for schema in schemas:
325
- result_element={}
326
- filterred_by_type_elements = filter_by_type(elements, schema['layout_type'])
327
- if 'labels' in schema:
328
- filterred_by_label_elements = filter_by_labels(filterred_by_type_elements, schema['labels'], schema['format'])
329
- if filterred_by_label_elements is not None:
330
- result_element[schema['name']] = filterred_by_label_elements
331
- result_elements.append(result_element)
332
- elif 'values' in schema:
333
- fitered_by_value_elements = filter_by_values(filterred_by_type_elements, schema['values'])
334
- if fitered_by_value_elements is not None:
335
- result_element[schema['name']] = fitered_by_value_elements
336
- result_elements.append(result_element)
337
- else:
338
- if filterred_by_type_elements is not None:
339
- result_element[schema['name']] = filterred_by_type_elements[0].text
340
- result_elements.append(result_element)
341
-
342
- return result_elements
343
-
344
-
345
- @app.post("/pdf2metadata/")
346
- async def extract_upload_file(file: UploadFile = File(...)):
347
- from unstructured.partition.pdf import partition_pdf
348
-
349
-
350
-
351
- file_savePath = join(temp_path,file.filename)
352
-
353
- with open(file_savePath,'wb') as f:
354
- shutil.copyfileobj(file.file, f)
355
 
356
- # Returns a List[Element] present in the pages of the parsed pdf document
357
- elements = partition_pdf(file_savePath, languages=["vie"])
358
 
359
- schemas = [{'name':'publisher','layout_type':'Title','position':0,'from_last':False},{'name':'number','layout_type':'Text','position':0,'from_last':False, 'label':['Số','Luật số']}]
360
-
361
- return get_elements_by_schemas(elements, schemas)
362
 
363
  @app.get("/")
364
  def api_home():
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import pdfplumber
6
  import pytesseract
7
+ from models import Article, Chapter, Law
8
 
9
  # Loading
10
  import os
 
273
  async def convert_upload_file(file: UploadFile = File(...)):
274
  import pytesseract
275
  from pdf2image import convert_from_path
276
+ from octoai.client import OctoAI
277
+ from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
278
 
279
  file_savePath = join(temp_path,file.filename)
280
 
 
290
  ocr_text = pytesseract.image_to_string(image,lang='vie')
291
  text=text+ocr_text+'\n'
292
 
293
+ client = OctoAI()
294
+
295
+ completion = client.text_gen.create_chat_completion(
296
+ model="meta-llama-3-8b-instruct",
297
+ messages=[
298
+ ChatMessage(role="system", content="You are a helpful assistant."),
299
+ ChatMessage(role="user", content=text),
300
+ ],
301
+ max_tokens=512,
302
+ presence_penalty=0,
303
+ temperature=0.1,
304
+ top_p=0.9,
305
+ response_format=ChatCompletionResponseFormat(
306
+ type="json_object",
307
+ schema=Law.model_json_schema(),
308
+ ),
309
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
+ return {content:text,metadate:completion.choices[0].message.content}
 
312
 
 
 
 
313
 
314
  @app.get("/")
315
  def api_home():