Spaces:
Running
Running
import platform | |
import re | |
from pathlib import Path | |
from urllib.parse import quote, unquote | |
# Characters treated as invalid in file and directory names.
# Background: "What characters are forbidden in Windows and Linux directory names?"
# https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
INVALID_FILE_PATH_CHARS = [
    "\\",
    "/",
    ":",
    "*",
    "?",
    '"',
    "<",
    ">",
    "|",
    # ASCII control characters 0-31. This range already contains "\t", "\n"
    # and "\r", so they are not listed a second time.
    *[chr(i) for i in range(32)],
]
# Lower-cased device names that Windows reserves and that therefore cannot be
# used as the base name of a file.
WINDOWS_INVALID_FILE_PATH_NAMES = [
    "con",
    "prn",
    "aux",
    "nul",
    # COM0-COM9 and LPT0-LPT9 are reserved. "com10" / "lpt10" are not, but
    # they were part of this list before and are kept so that names generated
    # by earlier versions stay unchanged.
    *[f"com{i}" for i in range(11)],
    *[f"lpt{i}" for i in range(11)],
    # Superscript digits one, two and three are reserved as well.
    *[f"com{digit}" for digit in "\u00b9\u00b2\u00b3"],
    *[f"lpt{digit}" for digit in "\u00b9\u00b2\u00b3"],
]
class FilepathConverter:
    """Turn an arbitrary string into a sanitized file path below ``files/``.

    ``convert`` runs the pipeline ``preprocess`` -> ``validate`` ->
    ``append_extension``. Subclasses adjust the result by overriding
    ``preprocess`` and/or by changing ``output_root``.
    """

    def __init__(self, parent: str = None):
        """Initialise the output root and the default parent directory.

        Args:
            parent: Optional directory name inserted between the output root
                and the file name. ``convert`` can override it per call.
        """
        # "files" directory located one level above this module's directory.
        self.output_root = Path(__file__).parents[1] / "files"
        self.parent = parent
        # Results of the most recent ``convert`` call (None until then).
        self.filename = None
        self.filepath = None

    def preprocess(self, input_string):
        """Hook for subclasses; the base implementation returns the input as is."""
        return input_string

    def validate(self, input_string):
        """Replace characters that are invalid in file names with ``_``.

        On Windows, if the part before the first dot is a reserved device
        name, an underscore is appended to that part.

        Args:
            input_string: Candidate file or directory name.

        Returns:
            The sanitized name. Falsy input (``None`` or ``""``) is returned
            unchanged.
        """
        if not input_string:
            return input_string

        # One translation pass instead of one ``str.replace`` per character.
        replacements = str.maketrans({char: "_" for char in INVALID_FILE_PATH_CHARS})
        filename = input_string.translate(replacements)

        if platform.system() == "Windows":
            base, dot, rest = filename.partition(".")
            if base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
                filename = f"{base}_{dot}{rest}"
        return filename

    def append_extension(self, filename, accept_exts=(".html", ".htm"), ext=".html"):
        """Append ``ext`` unless ``filename`` already has an accepted extension.

        Args:
            filename: File name to check.
            accept_exts: Lower-case extensions (with leading dot) that count
                as already present.
            ext: Extension to append; a falsy value disables appending.

        Returns:
            The file name, with ``ext`` appended when required.
        """
        if not ext:
            return filename

        # A name without any dot has no extension at all. Without this check
        # a bare name such as "html" would be mistaken for having ".html".
        _, dot, suffix = filename.rpartition(".")
        current_ext = (dot + suffix).lower() if dot else ""
        if current_ext not in accept_exts:
            filename += ext
        return filename

    def convert(self, input_string, parent=None):
        """Build the output path for ``input_string``.

        Args:
            input_string: Text to derive the file name from.
            parent: Optional directory name; falls back to ``self.parent``.

        Returns:
            The resulting ``Path``. It is also stored in ``self.filepath``,
            and the bare file name in ``self.filename``.
        """
        filename = self.preprocess(input_string)
        filename = self.validate(filename)
        filename = self.append_extension(filename)

        # The parent directory name goes through the same sanitization.
        parent = self.validate(parent or self.parent)
        if parent:
            filepath = self.output_root / parent / filename
        else:
            filepath = self.output_root / filename

        self.filename = filename
        self.filepath = filepath
        return self.filepath
class UrlToFilepathConverter(FilepathConverter):
    """Converter for URLs; files are placed in the ``urls`` sub-directory."""

    def __init__(self, parent: str = None):
        """Initialise the base converter and point the output root at ``urls``."""
        super().__init__(parent)
        self.output_root = self.output_root / "urls"

    def preprocess(self, url):
        """Drop the scheme from ``url`` and percent-decode the remainder.

        Args:
            url: URL string, e.g. ``"https://host/path"``.

        Returns:
            Everything after the first ``//``, percent-decoded. A string
            without ``//`` is decoded and returned whole.
        """
        # Split on the first "//" only: a later "//" inside the path must not
        # truncate the name, and a missing "//" must not raise IndexError.
        _, separator, remainder = url.partition("//")
        return unquote(remainder if separator else url)
class QueryToFilepathConverter(FilepathConverter):
    """Converter for search queries; files are placed in the ``queries`` sub-directory."""

    def __init__(self, parent: str = None):
        """Initialise the base converter and point the output root at ``queries``."""
        super().__init__(parent)
        queries_root = self.output_root.joinpath("queries")
        self.output_root = queries_root
def main():
    """Print the file paths produced for a sample query and a sample URL."""
    # Sample query containing non-ASCII characters.
    query = "python 教程"
    query_converter = QueryToFilepathConverter()
    print(query_converter.convert(query))

    # Alternative sample: "https://trafilatura.readthedocs.io/en/latest/quickstart.html"
    url = (
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
    )
    # The query doubles as the parent directory for the URL's file.
    url_converter = UrlToFilepathConverter(parent=query)
    print(url_converter.convert(url))


if __name__ == "__main__":
    main()