Spaces:
Runtime error
Runtime error
import os | |
import re | |
import pandas as pd | |
from urllib.parse import urlparse | |
import logging | |
# Configure the root logger at import time: timestamped records, INFO and up.
# Note that basicConfig is a no-op when the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Logger named after this module; its name fills the [%(name)s] field above.
logger = logging.getLogger(__name__)
class UTILS:
    """Stateless helpers for text clean-up, DataFrame tidying and URL checks."""

    def __init__(self):
        pass

    def split_text(self, text):
        """Split comma-separated text into whitespace-trimmed pieces.

        Args:
            text: String to split on ``,``.

        Returns:
            List of the pieces with surrounding whitespace stripped. An empty
            input yields ``['']`` (one empty piece), mirroring ``str.split``.
        """
        return [piece.strip() for piece in text.split(',')]

    def replace_newlines_and_spaces(self, text):
        """Collapse every run of whitespace (newlines included) to one space.

        Leading/trailing whitespace is collapsed to a single space as well,
        not removed.

        Args:
            text: String to normalise.

        Returns:
            The normalised string.
        """
        # r'\s+' already matches newlines, so a single substitution suffices.
        return re.sub(r'\s+', ' ', text)

    def clean_df(self, df, dropna=True, fillna=False):
        """Return a cleaned copy of ``df``; the input frame is left untouched.

        Steps, in order: optionally replace missing values with ``''``,
        optionally drop rows that still contain missing values, then drop
        duplicate rows and renumber the index from 0.

        Args:
            df: DataFrame to clean.
            dropna: When true, drop rows containing any missing value.
            fillna: When true, replace missing values with an empty string
                first (so a subsequent ``dropna`` finds nothing to drop).

        Returns:
            A new DataFrame with the requested cleaning applied.
        """
        # Non-inplace calls: mutating the argument would leave the caller's
        # frame half-cleaned and triggers pandas warnings on sliced inputs.
        if fillna:
            df = df.fillna('')
        if dropna:
            df = df.dropna()
        return df.drop_duplicates().reset_index(drop=True)

    def validate_url_format(self, urls, url_type='urls'):
        """Filter ``urls`` down to the entries that look like absolute URLs.

        An entry is kept when it parses with both a scheme and a network
        location. When ``url_type == 'online_pdf'`` it must additionally point
        at a ``.pdf`` resource (case-insensitive; a trailing query string or
        fragment after the path is tolerated). Entries that cannot be parsed
        at all are skipped instead of aborting the whole batch.

        Args:
            urls: Iterable of candidate URLs.
            url_type: ``'online_pdf'`` enables the ``.pdf`` check; any other
                value applies only the scheme/netloc check.

        Returns:
            List of the accepted URLs, in input order.
        """
        log = logging.getLogger(__name__)
        require_pdf = url_type == 'online_pdf'
        valid_urls = []
        for url in urls:
            try:
                parsed = urlparse(url)
            except (ValueError, AttributeError):
                # ValueError: malformed netloc (e.g. unbalanced IPv6 bracket).
                # AttributeError: entry is not a string (e.g. a NaN float).
                continue
            if not (parsed.scheme and parsed.netloc):
                continue
            if require_pdf:
                # Accept the historical "whole URL ends with .pdf" rule and
                # also URLs whose path is a PDF followed by ?query/#fragment.
                looks_like_pdf = (
                    url.lower().endswith('.pdf')
                    or parsed.path.lower().endswith('.pdf')
                )
                if not looks_like_pdf:
                    continue
            valid_urls.append(url)
        log.info('Valid URLs are: %s', valid_urls)
        return valid_urls