Marroco93 committed on
Commit
0f9cd45
1 Parent(s): 6b74d17

no message

Browse files
Files changed (1) hide show
  1. main.py +41 -3
main.py CHANGED
@@ -108,14 +108,52 @@ def reduce_tokens(text: str):
108
  token_count = len(reduced_doc)
109
  return reduced_text, token_count
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  @app.post("/summarize")
112
  async def summarize(request: TextRequest):
113
  try:
 
114
  processed_text = preprocess_text(request.text)
115
- reduced_text, token_count = reduce_tokens(processed_text)
 
 
 
 
 
 
 
116
  return {
117
- "reduced_text": reduced_text,
118
- "token_count": token_count
119
  }
120
 
121
  except Exception as e:
 
108
  token_count = len(reduced_doc)
109
  return reduced_text, token_count
110
 
111
+ def segment_text(text: str, max_length=512):
112
+ # Use spaCy to divide the document into sentences
113
+ doc = nlp(text)
114
+ sentences = [sent.text for sent in doc.sents]
115
+
116
+ # Group sentences into segments of approximately max_length tokens
117
+ segments = []
118
+ current_segment = []
119
+ current_length = 0
120
+
121
+ for sentence in sentences:
122
+ sentence_length = len(sentence.split())
123
+ if current_length + sentence_length > max_length:
124
+ segments.append(' '.join(current_segment))
125
+ current_segment = [sentence]
126
+ current_length = sentence_length
127
+ else:
128
+ current_segment.append(sentence)
129
+ current_length += sentence_length
130
+
131
+ if current_segment:
132
+ segments.append(' '.join(current_segment))
133
+
134
+ return segments
135
+
136
+ classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
137
+
138
+ def classify_segments(segments):
139
+ return [classifier(segment) for segment in segments]
140
+
141
  @app.post("/summarize")
142
  async def summarize(request: TextRequest):
143
  try:
144
+ # Preprocess and segment the text
145
  processed_text = preprocess_text(request.text)
146
+ segments = segment_text(processed_text)
147
+
148
+ # Classify each segment
149
+ classified_segments = classify_segments(segments)
150
+
151
+ # Optionally, reduce tokens for some specific task or summarize
152
+ reduced_texts = [reduce_tokens(segment)[0] for segment in segments]
153
+
154
  return {
155
+ "classified_segments": classified_segments,
156
+ "reduced_texts": reduced_texts
157
  }
158
 
159
  except Exception as e: