anakin87 committed on
Commit
083f13e
•
1 Parent(s): 2a15409

update Haystack and simplify

Browse files
backend/custom_components.py CHANGED
@@ -1,93 +1,10 @@
1
- from pathlib import Path
2
- from typing import Any, Dict, List, Optional, Union
3
 
4
- from haystack import Document, component, logging
5
- from haystack.components.converters.utils import (
6
- get_bytestream_from_source,
7
- normalize_metadata,
8
- )
9
- from haystack.dataclasses import ByteStream
10
-
11
- from trafilatura import extract
12
 
13
  import json
14
  import json_repair
15
 
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- @component
20
- class TrafilaturaHTMLConverter:
21
- """
22
- Converts an HTML file to a Document using Trafilatura.
23
-
24
- Usage example:
25
- ```python
26
- converter = TrafilaturaHTMLConverter()
27
- results = converter.run(sources=["path/to/sample.html"])
28
- documents = results["documents"]
29
- print(documents[0].content)
30
- # 'This is a text from the HTML file.'
31
- ```
32
- """
33
-
34
- @component.output_types(documents=List[Document])
35
- def run(
36
- self,
37
- sources: List[Union[str, Path, ByteStream]],
38
- meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
39
- ):
40
- """
41
- Converts a list of HTML files to Documents.
42
-
43
- :param sources:
44
- List of HTML file paths or ByteStream objects.
45
- :param meta:
46
- Optional metadata to attach to the Documents.
47
- This value can be either a list of dictionaries or a single dictionary.
48
- If it's a single dictionary, its content is added to the metadata of all produced Documents.
49
- If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
50
- If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
51
- :param extract_kwargs:
52
- Additional keyword arguments to pass to the Trafilatura `extract` method.
53
- See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.
54
-
55
- :returns:
56
- A dictionary with the following keys:
57
- - `documents`: Created Documents
58
- """
59
-
60
- documents = []
61
- meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
62
-
63
- for source, metadata in zip(sources, meta_list):
64
- try:
65
- bytestream = get_bytestream_from_source(source=source)
66
- except Exception as e:
67
- logger.warning(
68
- "Could not read {source}. Skipping it. Error: {error}",
69
- source=source,
70
- error=e,
71
- )
72
- continue
73
-
74
- text = None
75
- try:
76
- text = extract(bytestream.data.decode("utf-8"))
77
- except Exception as conversion_e:
78
- logger.warning(
79
- "Failed to extract text from {source}. Error: {error}",
80
- source=source,
81
- error=conversion_e,
82
- )
83
- continue
84
-
85
- document = Document(content=text, meta={**bytestream.meta, **metadata})
86
- documents.append(document)
87
-
88
- return {"documents": documents}
89
-
90
-
91
  @component
92
  class QuizParser:
93
  @component.output_types(quiz=Dict)
 
1
+ from typing import Dict, List
 
2
 
3
+ from haystack import component
 
 
 
 
 
 
 
4
 
5
  import json
6
  import json_repair
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  @component
9
  class QuizParser:
10
  @component.output_types(quiz=Dict)
backend/pipelines.py CHANGED
@@ -1,4 +1,5 @@
1
- from .custom_components import TrafilaturaHTMLConverter, QuizParser
 
2
  from haystack.components.fetchers import LinkContentFetcher
3
  from haystack.components.generators import OpenAIGenerator
4
  from haystack.components.builders import PromptBuilder
@@ -37,7 +38,7 @@ text:
37
 
38
  quiz_generation_pipeline = Pipeline()
39
  quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
40
- quiz_generation_pipeline.add_component("html_converter", TrafilaturaHTMLConverter())
41
  quiz_generation_pipeline.add_component(
42
  "prompt_builder", PromptBuilder(template=quiz_generation_template)
43
  )
 
1
+ from .custom_components import QuizParser
2
+ from haystack.components.converters import HTMLToDocument
3
  from haystack.components.fetchers import LinkContentFetcher
4
  from haystack.components.generators import OpenAIGenerator
5
  from haystack.components.builders import PromptBuilder
 
38
 
39
  quiz_generation_pipeline = Pipeline()
40
  quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
41
+ quiz_generation_pipeline.add_component("html_converter", HTMLToDocument())
42
  quiz_generation_pipeline.add_component(
43
  "prompt_builder", PromptBuilder(template=quiz_generation_template)
44
  )
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
- haystack-ai==2.1.2
2
- trafilatura
3
  json-repair
4
  gradio
 
1
+ haystack-ai==2.2.0
 
2
  json-repair
3
  gradio