# Spaces: Runtime error (apparently status text captured along with this file; not Python code)
# TODO: this module needs to be cleaned up.
import csv
import json
import os
import subprocess
import sys

import fitz  # PyMuPDF
import PyPDF2
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file using PyMuPDF.

    :param pdf_path: Path to the PDF file.
    :return: The text of every page, concatenated in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        # Collect the page texts and join once instead of growing a string
        # page by page.
        return ''.join(page.get_text() for page in doc)
    finally:
        # Close the document even when extracting a page raises, so the
        # underlying file handle is never leaked.
        doc.close()
# Single-character substitutions applied to text extracted from PDFs:
# umlauts (either case) become lower-case digraphs; commas, semicolons,
# backslashes and double quotes become "_"; line breaks become the literal
# marker "<newline>".
_PDF_TEXT_REPLACEMENTS = str.maketrans({
    "ä": "ae", "Ä": "ae",
    "ö": "oe", "Ö": "oe",
    "ü": "ue", "Ü": "ue",
    ",": "_", ";": "_", "\\": "_", '"': "_",
    "\n": "<newline>",
})


def _sanitize_pdf_text(text):
    """Apply every replacement in _PDF_TEXT_REPLACEMENTS to text in one pass."""
    return text.translate(_PDF_TEXT_REPLACEMENTS)


def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PyPDF2's PdfReader and
    extracts their text.

    The extracted text is cleaned up before it is stored: umlauts are replaced
    by "ae"/"oe"/"ue", the characters , ; \\ " are replaced by "_", and every
    line break is replaced by the marker "<newline>".

    Parameters:
    - folder_path: The path to the folder containing PDF files. Only entries
      whose name ends with '.pdf' are read; subfolders are not searched.
    Returns:
    - A dictionary with file names as keys and their cleaned-up text as values.
    """
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        page_texts = []
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                try:
                    # Guard against a page that yields no text at all.
                    page_texts.append(page.extract_text() or '')
                except UnicodeDecodeError as e:
                    # Best effort: report the page that failed to decode and
                    # keep the text of the remaining pages.
                    print(e)
        # One translate() pass replaces the former per-character
        # index-and-slice rewriting, which was quadratic in the text length.
        pdf_texts[filename] = _sanitize_pdf_text(''.join(page_texts))
    return pdf_texts
def read_csv_lines_as_strings(filename):
    """
    Parses a CSV file and returns every record re-joined into a single string.

    Parameters:
    - filename: The path to the CSV file.
    Returns:
    - A list of strings, one per CSV record, with the fields joined by commas.
      If decoding fails part-way through, the error is printed and the records
      read up to that point are returned.
    """
    collected = []
    with open(filename, newline='') as handle:
        try:
            # Explicit loop so that records appended before a decoding error
            # are kept.
            for fields in csv.reader(handle):
                collected.append(','.join(fields))
        except UnicodeDecodeError as decode_error:
            print(decode_error)
    return collected
def load_data(filename):
    """
    Loads and parses a JSON file.

    Parameters:
    - filename: The path to the JSON file.
    Returns:
    - The parsed JSON content, or an empty dictionary when the file cannot be
      decoded as text (the decoding error is printed).
    """
    with open(filename, 'r') as handle:
        try:
            parsed = json.load(handle)
        except UnicodeDecodeError as decode_error:
            print(decode_error)
            parsed = {}
    return parsed
def find_and_open_file(filename, start_directory):
    """
    Searches start_directory and all of its subfolders for a file called
    filename and returns the path of the first match.

    Despite its name, this function only locates the file; it does not open it.
    The outcome of the search is printed.

    Parameters:
    - filename: The bare file name to look for.
    - start_directory: The directory at which the recursive search starts.
    Returns:
    - The path of the first match, or None when no such file exists.
    """
    # Lazy generator: the directory walk stops as soon as the first match is
    # requested.
    candidates = (
        os.path.join(folder, filename)
        for folder, _subfolders, names in os.walk(start_directory)
        if filename in names
    )
    filepath = next(candidates, None)
    if filepath is None:
        print(f"File {filename} not found.")
        return None
    print(f"File found: {filepath}")
    return filepath
def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.

    Nothing is returned; problems (missing file, unsupported system, missing
    launcher command) are reported on standard output.

    Parameters:
    - filepath: The path of the file to open.
    """
    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return
    if os.name == 'nt':  # Windows
        os.startfile(filepath)
    elif os.name == 'posix':
        # `open` is the macOS launcher only; other POSIX desktops (Linux, BSD)
        # provide `xdg-open` for the same purpose.
        launcher = 'open' if sys.platform == 'darwin' else 'xdg-open'
        try:
            subprocess.call((launcher, filepath))
        except FileNotFoundError:
            # The launcher command itself is not installed on this system.
            print(f"Cannot open file on this operating system: {filepath}")
    else:
        print(f"Cannot open file on this operating system: {filepath}")
def list_folders_files_recursive(path, depth=0):
    """
    Prints the folders and files below the given path as an indented tree.

    Each folder is printed as soon as it is encountered, immediately followed
    by its own contents; the files of a directory are printed after all of its
    entries have been visited.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current recursion depth; one space of indentation is printed
      per level.
    Returns:
    - None
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    prefix = ' ' * depth
    pending_files = []
    for name in os.listdir(path):
        child = os.path.join(path, name)
        if os.path.isdir(child):
            print(f"{prefix}Folder: {name}")
            # Descend right away so the folder's contents follow its heading.
            list_folders_files_recursive(child, depth + 1)
            continue
        if os.path.isfile(child):
            pending_files.append(name)

    for name in pending_files:
        print(f"{prefix}File: {name}")
def list_folders_files(path):
    """
    Collects the names of the folders and files directly inside a directory.

    Parameters:
    - path: The directory path to list contents from.
    Returns:
    - A tuple of two lists: (folders, files). Both lists are empty, and a
      message is printed, when path is not a directory.
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return [], []

    names = os.listdir(path)
    # An entry cannot be both a directory and a regular file, so the two
    # selections never overlap.
    folders = [name for name in names if os.path.isdir(os.path.join(path, name))]
    files = [name for name in names if os.path.isfile(os.path.join(path, name))]
    return folders, files
# When executed as a script, print a one-line description of this module.
if __name__ == "__main__":
    print("here are all functions that read files")