Spaces:

deepset
/

autoquizzer

Running

App Files Files Community

autoquizzer / backend /custom_components.py

anakin87

first commit

8bd40a9 9 months ago

raw

history blame

3.86 kB

	from pathlib import Path
	from typing import Any, Dict, List, Optional, Union

	from haystack import Document, component, logging
	from haystack.components.converters.utils import (
	get_bytestream_from_source,
	normalize_metadata,
	)
	from haystack.dataclasses import ByteStream

	from trafilatura import extract

	import json
	import json_repair

	# Module-level logger obtained from Haystack's `logging` module; used below to
	# warn about sources that cannot be read or converted.
	logger = logging.getLogger(__name__)


	@component
	class TrafilaturaHTMLConverter:
	    """
	    Converts an HTML file to a Document using Trafilatura.

	    Usage example:
	    ```python
	    converter = TrafilaturaHTMLConverter()
	    results = converter.run(sources=["path/to/sample.html"])
	    documents = results["documents"]
	    print(documents[0].content)
	    # 'This is a text from the HTML file.'
	    ```
	    """

	    @component.output_types(documents=List[Document])
	    def run(
	        self,
	        sources: List[Union[str, Path, ByteStream]],
	        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
	    ):
	        """
	        Converts a list of HTML files to Documents.

	        Sources that cannot be read, decoded as UTF-8 or processed by Trafilatura
	        are logged with a warning and skipped, so the output may contain fewer
	        Documents than there are sources.

	        :param sources:
	            List of HTML file paths or ByteStream objects.
	        :param meta:
	            Optional metadata to attach to the Documents.
	            This value can be either a list of dictionaries or a single dictionary.
	            If it's a single dictionary, its content is added to the metadata of all produced Documents.
	            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
	            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

	        :returns:
	            A dictionary with the following keys:
	            - `documents`: Created Documents
	        """

	        documents = []
	        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

	        for source, metadata in zip(sources, meta_list):
	            try:
	                bytestream = get_bytestream_from_source(source=source)
	            except Exception as e:
	                logger.warning(
	                    "Could not read {source}. Skipping it. Error: {error}",
	                    source=source,
	                    error=e,
	                )
	                continue

	            try:
	                # Decoding is inside the try block on purpose: a source that is
	                # not valid UTF-8 is skipped like any other conversion failure.
	                text = extract(bytestream.data.decode("utf-8"))
	            except Exception as conversion_e:
	                logger.warning(
	                    "Failed to extract text from {source}. Error: {error}",
	                    source=source,
	                    error=conversion_e,
	                )
	                continue

	            # Merge the two metadata dictionaries. Unpacking `metadata` last lets
	            # user-supplied values override the ByteStream's own metadata on key
	            # clashes. (A plain `{a, b}` literal would build a set of dicts and
	            # fail with "unhashable type: 'dict'".)
	            merged_meta = {**bytestream.meta, **metadata}

	            # The Document is created with whatever `extract` returned, even if
	            # no text was found.
	            documents.append(Document(content=text, meta=merged_meta))

	        return {"documents": documents}


	@component
	class QuizParser:
	    """
	    Extracts the quiz (a JSON object) from the text reply of a language model.
	    """

	    @component.output_types(quiz=Dict)
	    def run(self, replies: List[str]):
	        """
	        Parses the first reply into a quiz.

	        Only `replies[0]` is considered. The JSON portion is taken from the first
	        opening brace/bracket to the last closing brace/bracket; if it is not
	        valid JSON, `json_repair` is used to recover what it can.

	        :param replies:
	            Replies produced by the model. Only the first one is parsed.

	        :returns:
	            A dictionary with the following keys:
	            - `quiz`: The parsed JSON value. If the JSON is a list, its first element is returned.

	        :raises IndexError:
	            If `replies` is empty or the parsed JSON is an empty list.
	        """
	        reply = replies[0]

	        # even if prompted to respond with JSON only, sometimes the model returns a mix of JSON and text
	        # `str.find` returns -1 for a missing character, so misses must be dropped
	        # before taking the minimum; otherwise a reply holding only an object (no
	        # "[") would start the slice at -1, i.e. at its last character.
	        opening_positions = [
	            position
	            for position in (reply.find("{"), reply.find("["))
	            if position != -1
	        ]
	        first_index = min(opening_positions) if opening_positions else 0

	        # `rfind` misses give -1 + 1 == 0. When no closing character follows the
	        # opening one (e.g. truncated output), keep everything up to the end of
	        # the reply so that the repair step below still has something to work on.
	        last_index = max(reply.rfind("}"), reply.rfind("]")) + 1
	        if last_index <= first_index:
	            last_index = len(reply)

	        json_portion = reply[first_index:last_index]

	        try:
	            quiz = json.loads(json_portion)
	        except json.JSONDecodeError:
	            # if the JSON is not well-formed, try to repair it
	            quiz = json_repair.loads(json_portion)

	        # sometimes the JSON contains a list instead of a dictionary
	        if isinstance(quiz, list):
	            quiz = quiz[0]

	        # Kept from the original implementation: echoes the parsed quiz to stdout.
	        print(quiz)

	        return {"quiz": quiz}