# NOTE: page-header residue captured with this file ("Spaces: Runtime error"); not part of the module.
import csv
from typing import Any, Dict, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import (
    UnstructuredFileLoader,
    validate_unstructured_version,
)


class CSVLoader(BaseLoader):
    """Loads a CSV file into a list of documents.

    Each document represents one row of the CSV file. Every row is converted
    into key/value pairs, one ``column: value`` pair per line of the
    document's page_content.

    The source for each document loaded from csv is set to the value of the
    `file_path` argument for all documents by default.
    You can override this by setting the `source_column` argument to the
    name of a column in the CSV file.
    The source of each document will then be set to the value of the column
    with the name specified in `source_column`.

    Rows that do not match the header length are tolerated: surplus fields
    (stored by ``csv.DictReader`` under its ``restkey``) are left out of the
    page content, and missing fields are rendered as empty values.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """
        Args:
            file_path: The path to the CSV file.
            source_column: The name of the column in the CSV file to use as
                the source. Optional. Defaults to None.
            csv_args: A dictionary of arguments to pass to the csv.DictReader.
                Optional. Defaults to None.
            encoding: The encoding of the CSV file. Optional. Defaults to None.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}

    @staticmethod
    def _format_field(value: Any) -> str:
        """Return *value* as stripped text; ``None`` becomes an empty string."""
        if value is None:
            # Short rows are padded by DictReader with `restval` (None by default).
            return ""
        if isinstance(value, str):
            return value.strip()
        return str(value).strip()

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One Document per CSV row. ``metadata["source"]`` is the value of
            `source_column` for that row (or `file_path` when no source
            column is configured) and ``metadata["row"]`` is the zero-based
            row index.

        Raises:
            ValueError: If `source_column` is set but is not a column of the
                CSV file.
        """
        docs = []
        # DictReader collects the surplus fields of an over-long row as a list
        # under `restkey` (None unless overridden through csv_args). Compare
        # against that real key rather than the literal string 'restkey'.
        restkey = self.csv_args.get("restkey")
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
            for i, row in enumerate(csv_reader):
                content = "\n".join(
                    f"{self._format_field(k)}: {self._format_field(v)}"
                    for k, v in row.items()
                    if k != restkey
                )
                if self.source_column is None:
                    source = self.file_path
                else:
                    try:
                        source = row[self.source_column]
                    except KeyError as exc:
                        raise ValueError(
                            f"Source column '{self.source_column}' not found in CSV file."
                        ) from exc
                metadata = {"source": source, "row": i}
                docs.append(Document(page_content=content, metadata=metadata))
        return docs
class UnstructuredCSVLoader(UnstructuredFileLoader):
    """CSV loader backed by the `unstructured` package.

    As with the other Unstructured loaders, both the "single" and the
    "elements" modes are supported. In "elements" mode the CSV file will be
    a single Unstructured Table element, and an HTML representation of the
    table will be available under the "text_as_html" key of the document
    metadata.

    Examples
    --------
    from langchain.document_loaders.csv_loader import UnstructuredCSVLoader

    loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
    docs = loader.load()
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """
        Args:
            file_path: The path to the CSV file.
            mode: The mode to use when loading the CSV file.
                Optional. Defaults to "single".
            **unstructured_kwargs: Keyword arguments to pass to unstructured.
        """
        # Check the installed `unstructured` version before the base class
        # is initialised.
        validate_unstructured_version(min_unstructured_version="0.6.8")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the CSV file with `unstructured` and return the result."""
        # Imported here rather than at module level, so `unstructured` is
        # only needed once elements are actually requested.
        from unstructured.partition.csv import partition_csv

        elements = partition_csv(filename=self.file_path, **self.unstructured_kwargs)
        return elements