main.py
CHANGED
@@ -108,14 +108,52 @@ def reduce_tokens(text: str):
     token_count = len(reduced_doc)
     return reduced_text, token_count
 
+def segment_text(text: str, max_length=512):
+    # Use spaCy to divide the document into sentences
+    doc = nlp(text)
+    sentences = [sent.text for sent in doc.sents]
+
+    # Group sentences into segments of approximately max_length tokens
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(sentence.split())
+        if current_length + sentence_length > max_length:
+            segments.append(' '.join(current_segment))
+            current_segment = [sentence]
+            current_length = sentence_length
+        else:
+            current_segment.append(sentence)
+            current_length += sentence_length
+
+    if current_segment:
+        segments.append(' '.join(current_segment))
+
+    return segments
+
+classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
+
+def classify_segments(segments):
+    return [classifier(segment) for segment in segments]
+
 @app.post("/summarize")
 async def summarize(request: TextRequest):
     try:
+        # Preprocess and segment the text
         processed_text = preprocess_text(request.text)
-
+        segments = segment_text(processed_text)
+
+        # Classify each segment
+        classified_segments = classify_segments(segments)
+
+        # Optionally, reduce tokens for some specific task or summarize
+        reduced_texts = [reduce_tokens(segment)[0] for segment in segments]
+
         return {
-            "
-            "
+            "classified_segments": classified_segments,
+            "reduced_texts": reduced_texts
        }
 
     except Exception as e:
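
For reference, below is a minimal client sketch of how the updated /summarize endpoint could be exercised once the Space is running. The base URL and port are assumptions (a local FastAPI server on port 8000); the "text" request field and the "classified_segments" / "reduced_texts" response keys come from the diff above, and this client code is not part of main.py.

# Minimal usage sketch for the updated /summarize endpoint.
# Assumption: the app is served locally at http://localhost:8000.
import requests

payload = {"text": "Long document text to be segmented, classified, and reduced..."}
resp = requests.post("http://localhost:8000/summarize", json=payload)
resp.raise_for_status()

data = resp.json()
# Each entry in classified_segments is the pipeline output for one segment,
# e.g. [{"label": "POSITIVE", "score": 0.99}]; reduced_texts holds the
# token-reduced form of the same segment.
for segment_result, reduced in zip(data["classified_segments"], data["reduced_texts"]):
    print(segment_result, reduced[:80])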