autoquizzer / backend /custom_components.py
anakin87's picture
first commit
8bd40a9
raw
history blame
3.86 kB
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from haystack import Document, component, logging
from haystack.components.converters.utils import (
get_bytestream_from_source,
normalize_metadata,
)
from haystack.dataclasses import ByteStream
from trafilatura import extract
import json
import json_repair
logger = logging.getLogger(__name__)
@component
class TrafilaturaHTMLConverter:
    """
    Converts an HTML file to a Document using Trafilatura.

    Usage example:
    ```python
    converter = TrafilaturaHTMLConverter()
    results = converter.run(sources=["path/to/sample.html"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the HTML file.'
    ```
    """

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Converts a list of HTML files to Documents.

        Sources that cannot be read, decoded as UTF-8 or processed by Trafilatura
        are skipped with a warning instead of aborting the whole run.

        :param sources:
            List of HTML file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
        :param extract_kwargs:
            Additional keyword arguments to pass to the Trafilatura `extract` method.
            See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.
            Defaults to no extra arguments.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        # Normalize once so `None` and `{}` behave the same in the loop below.
        trafilatura_kwargs = extract_kwargs or {}

        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source=source)
            except Exception as e:
                logger.warning(
                    "Could not read {source}. Skipping it. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            try:
                # Decoding happens inside the `try` so that non-UTF-8 input is
                # reported and skipped like any other extraction failure.
                text = extract(bytestream.data.decode("utf-8"), **trafilatura_kwargs)
            except Exception as conversion_e:
                logger.warning(
                    "Failed to extract text from {source}. Error: {error}",
                    source=source,
                    error=conversion_e,
                )
                continue

            # NOTE(review): per the Trafilatura docs `extract` may return None when
            # no main text is found; such a Document is still emitted - confirm
            # downstream components tolerate empty content.
            # Caller-supplied metadata takes precedence over the ByteStream's own.
            document = Document(content=text, meta={**bytestream.meta, **metadata})
            documents.append(document)

        return {"documents": documents}
@component
class QuizParser:
    """
    Extracts a quiz (a JSON object) from the raw text reply of a language model.

    The JSON payload is located inside the reply, parsed with `json` and, when it
    is not well-formed, repaired with `json_repair`.
    """

    @component.output_types(quiz=Dict)
    def run(self, replies: List[str]):
        """
        Parses the first reply into a quiz.

        :param replies:
            Replies produced by the model. Only the first one is used.

        :returns:
            A dictionary with the following keys:
            - `quiz`: The parsed JSON content. If the reply contains a JSON list,
              its first element is returned.

        :raises IndexError:
            If `replies` is empty, or if the parsed JSON is an empty list.
        """
        reply = replies[0]

        # even if prompted to respond with JSON only, sometimes the model returns a mix of JSON and text
        # `str.find` returns -1 when the character is absent, so only the positions
        # that were actually found may take part in the `min`; otherwise a reply
        # containing "{" but no "[" (or vice versa) would be sliced from index -1.
        opening_positions = [
            position
            for position in (reply.find("{"), reply.find("["))
            if position != -1
        ]

        if not opening_positions:
            # No JSON opener at all: nothing to extract.
            json_portion = ""
        else:
            first_index = min(opening_positions)
            last_index = max(reply.rfind("}"), reply.rfind("]")) + 1
            if last_index <= first_index:
                # No closing bracket after the opener (e.g. truncated output):
                # keep the rest of the reply so that the repair step can work on it.
                last_index = len(reply)
            json_portion = reply[first_index:last_index]

        try:
            quiz = json.loads(json_portion)
        except json.JSONDecodeError:
            # if the JSON is not well-formed, try to repair it
            quiz = json_repair.loads(json_portion)

        # sometimes the JSON contains a list instead of a dictionary
        if isinstance(quiz, list):
            quiz = quiz[0]

        # Kept from the original implementation: echoes the parsed quiz to stdout.
        print(quiz)

        return {"quiz": quiz}