fdaudens (HF staff) committed
Commit 037744c
1 Parent(s): b011c43

add json checker

Files changed (2)
  1. .gitignore +5 -0
  2. app.py +43 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+dev.ipynb
+.env
+app-test.py
+dev copie.ipynb
+output.csv
app.py CHANGED
@@ -6,6 +6,8 @@ from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import gradio as gr
 import subprocess
+import json
+import re
 
 # Ensure Playwright installs required browsers and dependencies
 subprocess.run(["playwright", "install"])
@@ -34,6 +36,29 @@ graph_config = {
     },
     "embeddings": {"model_instance": embedder_model_instance}
 }
+#######
+def clean_json_string(json_str):
+    """
+    Removes any comments or prefixes before the actual JSON content.
+    Returns the cleaned JSON string.
+    """
+    # Find the first occurrence of '{'
+    json_start = json_str.find('{')
+    if json_start == -1:
+        # If no '{' is found, try with '[' for arrays
+        json_start = json_str.find('[')
+        if json_start == -1:
+            return json_str  # Return original if no JSON markers found
+
+    # Extract everything from the first JSON marker
+    cleaned_str = json_str[json_start:]
+
+    # Verify it's valid JSON
+    try:
+        json.loads(cleaned_str)
+        return cleaned_str
+    except json.JSONDecodeError:
+        return json_str  # Return original if cleaning results in invalid JSON
 
 def scrape_and_summarize(prompt, source):
     smart_scraper_graph = SmartScraperGraph(
@@ -42,9 +67,27 @@ def scrape_and_summarize(prompt, source):
         config=graph_config
     )
     result = smart_scraper_graph.run()
+
+    # Clean the result if it's a string
+    if isinstance(result, str):
+        result = clean_json_string(result)
+
     exec_info = smart_scraper_graph.get_execution_info()
     return result, prettify_exec_info(exec_info)
 
+
+
+#######
+# def scrape_and_summarize(prompt, source):
+#     smart_scraper_graph = SmartScraperGraph(
+#         prompt=prompt,
+#         source=source,
+#         config=graph_config
+#     )
+#     result = smart_scraper_graph.run()
+#     exec_info = smart_scraper_graph.get_execution_info()
+#     return result, prettify_exec_info(exec_info)
+
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Scrape websites, no-code version")
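
For reference, a minimal standalone sketch of how the new clean_json_string helper behaves. The helper body is copied from app.py above so the snippet runs on its own; the sample strings are illustrative, not taken from the repo.

import json

# Copied from app.py so this sketch is self-contained; behavior matches the helper in the diff above.
def clean_json_string(json_str):
    json_start = json_str.find('{')
    if json_start == -1:
        json_start = json_str.find('[')
        if json_start == -1:
            return json_str
    cleaned_str = json_str[json_start:]
    try:
        json.loads(cleaned_str)
        return cleaned_str
    except json.JSONDecodeError:
        return json_str

# Illustrative model output with a chatty prefix before the JSON payload
raw = 'Here is the extracted data: {"title": "Example", "links": ["https://example.com"]}'
print(clean_json_string(raw))   # {"title": "Example", "links": ["https://example.com"]}

# No JSON markers at all: the string is returned unchanged
print(clean_json_string("no structured data found"))   # no structured data found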