Spaces:

CodeHima
/

TOSRoberta

Sleeping

CodeHima committed on Jul 3, 2024

Commit

8766819

1 Parent(s): 34e855f

feat: Add utility functions for text processing and model prediction

Files changed (3) hide show

utils/__init__.py ADDED Viewed

+# utils/__init__.py
+from .text_processing import extract_text_from_pdf, split_into_clauses
+from .model_utils import predict_unfairness

utils/model_utils.py ADDED Viewed

+import torch
def predict_unfairness(text, model, tokenizer):
    """Classify a single clause as fair, potentially unfair, or clearly unfair.

    Args:
        text: The clause to classify. Exactly one text is expected; a batch
            is rejected (see Raises).
        model: A callable sequence-classification model exposing ``eval()``
            and returning an object with a ``logits`` tensor when called with
            the tokenizer output as keyword arguments.
        tokenizer: A callable that turns ``text`` into a mapping of PyTorch
            tensors (it is invoked with ``return_tensors="pt"``).

    Returns:
        tuple: ``(predicted_label, probabilities)`` where ``predicted_label``
        is one of ``'clearly_fair'``, ``'potentially_unfair'`` or
        ``'clearly_unfair'`` and ``probabilities`` is the softmax
        distribution over the classes as a list of floats.

    Raises:
        ValueError: If the model returns logits for more than one input.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Keep the inputs on the same device as the model; otherwise the forward
    # pass fails as soon as the model lives on an accelerator. Models that do
    # not expose a ``device`` attribute are called with the inputs unchanged.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    if logits.dim() > 1:
        # A bare ``squeeze()`` followed by a flat ``argmax`` silently yields a
        # wrong (or out-of-range) class index for batches, so reject them.
        if logits.shape[0] != 1:
            raise ValueError(
                f"predict_unfairness expects a single text, got a batch of {logits.shape[0]}"
            )
        logits = logits[0]

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities).item()
    label_mapping = {0: 'clearly_fair', 1: 'potentially_unfair', 2: 'clearly_unfair'}
    predicted_label = label_mapping[predicted_class]
    return predicted_label, probabilities.tolist()

utils/text_processing.py ADDED Viewed

+import PyPDF2
+import spacy
+import re
+nlp = spacy.load("en_core_web_sm")
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: The PDF source handed straight to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against the caller).

    Returns:
        str: The extracted text of all pages, in page order, joined with a
        newline. Pages that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` keeps the join safe if a page yields no text at all.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; a single join also avoids repeated ``+=``.
    return "\n".join(page_texts)
# A sentence containing any of these closes the clause being built: a number
# followed by a period ("2."), a lettered item "(a)", or an item made of the
# letters i/v such as "(iv)".
# NOTE(review): "\d+\." also matches decimals and sentence-final numbers
# (e.g. "in 2024.") -- confirm this is intended.
_CLAUSE_MARKER_RE = re.compile(r'\d+\.|\([a-z]\)|\([iv]+\)')

# Leading enumeration such as "3." or "3 " stripped from each finished clause.
# NOTE(review): the period is optional, so a clause that legitimately starts
# with a number ("30 days ...") loses it -- confirm this is intended.
_LEADING_NUMBER_RE = re.compile(r'^\s*\d+\.?\s*')


def split_into_clauses(text, max_clause_length=200):
    """Split a document into clause-sized chunks of whole sentences.

    Whitespace is normalised, the text is sentence-segmented with the
    module-level spaCy pipeline ``nlp``, and consecutive sentences are
    accumulated until one contains an enumeration marker or the accumulated
    text grows past ``max_clause_length`` characters.

    Args:
        text: The raw document text. ``None``, empty, or whitespace-only
            input yields an empty list without running the pipeline.
        max_clause_length: Character count above which the clause being
            built is closed. Defaults to 200.

    Returns:
        list[str]: The non-empty clauses, each with surrounding whitespace
        and any leading number removed.
    """
    if not text or not text.strip():
        return []

    # Collapse every whitespace run (newlines included) into a single space.
    # A separate newline-collapsing pass would be a no-op after this.
    text = re.sub(r'\s+', ' ', text)

    doc = nlp(text)

    clauses = []
    pending = []
    for sent in doc.sents:
        pending.append(sent.text)
        joined = ' '.join(pending)
        if _CLAUSE_MARKER_RE.search(sent.text) or len(joined) > max_clause_length:
            clauses.append(joined)
            pending = []
    # Whatever is left over forms the final clause.
    if pending:
        clauses.append(' '.join(pending))

    cleaned_clauses = []
    for clause in clauses:
        cleaned = _LEADING_NUMBER_RE.sub('', clause.strip())
        # Clauses that consisted only of a number/whitespace are dropped.
        if cleaned:
            cleaned_clauses.append(cleaned)
    return cleaned_clauses