fdaudens (HF staff) committed
Commit 037744c
1 Parent(s): b011c43

add json checker

Files changed (2)
  1. .gitignore +5 -0
  2. app.py +43 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+dev.ipynb
+.env
+app-test.py
+dev copie.ipynb
+output.csv
app.py CHANGED
@@ -6,6 +6,8 @@ from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import gradio as gr
 import subprocess
+import json
+import re
 
 # Ensure Playwright installs required browsers and dependencies
 subprocess.run(["playwright", "install"])
@@ -34,6 +36,29 @@ graph_config = {
     },
     "embeddings": {"model_instance": embedder_model_instance}
 }
+#######
+def clean_json_string(json_str):
+    """
+    Removes any comments or prefixes before the actual JSON content.
+    Returns the cleaned JSON string.
+    """
+    # Find the first occurrence of '{'
+    json_start = json_str.find('{')
+    if json_start == -1:
+        # If no '{' is found, try with '[' for arrays
+        json_start = json_str.find('[')
+        if json_start == -1:
+            return json_str  # Return original if no JSON markers found
+
+    # Extract everything from the first JSON marker
+    cleaned_str = json_str[json_start:]
+
+    # Verify it's valid JSON
+    try:
+        json.loads(cleaned_str)
+        return cleaned_str
+    except json.JSONDecodeError:
+        return json_str  # Return original if cleaning results in invalid JSON
 
 def scrape_and_summarize(prompt, source):
     smart_scraper_graph = SmartScraperGraph(
@@ -42,9 +67,27 @@ def scrape_and_summarize(prompt, source):
         config=graph_config
     )
     result = smart_scraper_graph.run()
+
+    # Clean the result if it's a string
+    if isinstance(result, str):
+        result = clean_json_string(result)
+
     exec_info = smart_scraper_graph.get_execution_info()
     return result, prettify_exec_info(exec_info)
 
+
+
+#######
+# def scrape_and_summarize(prompt, source):
+#     smart_scraper_graph = SmartScraperGraph(
+#         prompt=prompt,
+#         source=source,
+#         config=graph_config
+#     )
+#     result = smart_scraper_graph.run()
+#     exec_info = smart_scraper_graph.get_execution_info()
+#     return result, prettify_exec_info(exec_info)
+
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Scrape websites, no-code version")
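
For reference, a minimal standalone sketch of how the new clean_json_string helper behaves. The helper body is copied from app.py above so the snippet runs on its own; the sample strings are illustrative, not taken from the repo.

import json

# Copied from app.py so this sketch is self-contained; behavior matches the helper in the diff above.
def clean_json_string(json_str):
    json_start = json_str.find('{')
    if json_start == -1:
        json_start = json_str.find('[')
        if json_start == -1:
            return json_str
    cleaned_str = json_str[json_start:]
    try:
        json.loads(cleaned_str)
        return cleaned_str
    except json.JSONDecodeError:
        return json_str

# Illustrative model output with a chatty prefix before the JSON payload
raw = 'Here is the extracted data: {"title": "Example", "links": ["https://example.com"]}'
print(clean_json_string(raw))   # {"title": "Example", "links": ["https://example.com"]}

# No JSON markers at all: the string is returned unchanged
print(clean_json_string("no structured data found"))   # no structured data found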