Fred808 committed on
Commit 2bfba61 · verified · 1 Parent(s): c102267

Update app.py

Files changed (1)
  1. app.py +47 -81
app.py CHANGED
@@ -14,18 +14,11 @@ from sentence_transformers import SentenceTransformer
 from bertopic import BERTopic
 import faiss
 import numpy as np
-from googleapiclient.discovery import build
-from youtube_transcript_api import YouTubeTranscriptApi
+from datasets import load_dataset, Features, Value

 # Initialize FastAPI app
 app = FastAPI()

-# YouTube Data API setup
-API_KEY = "AIzaSyDBdxA6KdOwtaaTgt26EBYRyvknOObmgAc"
-YOUTUBE_API_SERVICE_NAME = "youtube"
-YOUTUBE_API_VERSION = "v3"
-youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)
-
 # Preprocessing function
 def preprocess_text(text):
     """
@@ -116,8 +109,6 @@ class SearchEngine:
         """
         Searches the index for the top_k most relevant documents.
         """
-        if self.index is None:
-            raise ValueError("Index not initialized. Call build_index() first.")
         query_embedding = self.model.encode(query, convert_to_tensor=True)
         distances, indices = self.index.search(query_embedding.cpu().detach().numpy().reshape(1, -1), top_k)
         return [(self.documents[i], distances[0][i]) for i in indices[0]]
@@ -155,58 +146,39 @@ documents = [
 ]
 search_engine.build_index(documents)

-# Fetch video metadata using YouTube Data API
-def fetch_video_metadata(video_id):
-    request = youtube.videos().list(
-        part="snippet,statistics",
-        id=video_id
+# Define the schema
+features = Features({
+    "video_id": Value("string"),
+    "video_link": Value("string"),
+    "title": Value("string"),
+    "text": Value("string"),
+    "channel": Value("string"),
+    "channel_id": Value("string"),
+    "date": Value("string"),
+    "license": Value("string"),
+    "original_language": Value("string"),
+    "source_language": Value("string"),
+    "transcription_language": Value("string"),
+    "word_count": Value("int64"),
+    "character_count": Value("int64"),
+})
+
+# Load the dataset from Hugging Face Hub
+try:
+    dataset = load_dataset(
+        "PleIAs/YouTube-Commons",
+        features=features,
+        streaming=True,
    )
-    response = request.execute()
-    return response["items"][0] if response["items"] else None
-
-
-# Fetch video transcript using youtube-transcript-api
-def fetch_video_transcript(video_id):
-    try:
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        return " ".join([entry["text"] for entry in transcript])
-    except Exception as e:
-        print(f"Error fetching transcript: {e}")
-        return None
-
-
-# Fetch and preprocess video data
-def fetch_and_preprocess_video_data(video_id):
-    metadata = fetch_video_metadata(video_id)
-    if not metadata:
-        return None
-
-    transcript = fetch_video_transcript(video_id)
-
-    # Preprocess the data
-    video_data = {
-        "video_id": video_id,
-        "video_link": f"https://www.youtube.com/watch?v={video_id}",
-        "title": metadata["snippet"]["title"],
-        "text": transcript if transcript else metadata["snippet"]["description"],
-        "channel": metadata["snippet"]["channelTitle"],
-        "channel_id": metadata["snippet"]["channelId"],
-        "date": metadata["snippet"]["publishedAt"],
-        "license": "Unknown",
-        "original_language": "Unknown",
-        "source_language": "Unknown",
-        "transcription_language": "Unknown",
-        "word_count": len(metadata["snippet"]["description"].split()),
-        "character_count": len(metadata["snippet"]["description"]),
-    }
-    return video_data

+    # Process the dataset
+    for example in dataset["train"]:
+        print(example)  # Process each example
+        break  # Stop after the first example for demonstration
+except Exception as e:
+    print(f"Error loading dataset: {e}")

 # Pydantic models for request validation
-class VideoRequest(BaseModel):
-    video_id: str
-
-
 class TextRequest(BaseModel):
     text: str

@@ -221,35 +193,32 @@ class PromptRequest(BaseModel):

 # API Endpoints
 @app.post("/classify")
-async def classify(request: VideoRequest):
-    video_id = request.video_id
-    video_data = fetch_and_preprocess_video_data(video_id)
-    if not video_data:
-        raise HTTPException(status_code=400, detail="Failed to fetch video data")
+async def classify(request: TextRequest):
+    text = request.text
+    if not text:
+        raise HTTPException(status_code=400, detail="No text provided")

-    result = classifier.classify(video_data["text"])
+    result = classifier.classify(text)
     return {"result": result}


 @app.post("/relevance")
-async def relevance(request: VideoRequest):
-    video_id = request.video_id
-    video_data = fetch_and_preprocess_video_data(video_id)
-    if not video_data:
-        raise HTTPException(status_code=400, detail="Failed to fetch video data")
+async def relevance(request: TextRequest):
+    text = request.text
+    if not text:
+        raise HTTPException(status_code=400, detail="No text provided")

-    relevant = relevance_detector.detect_relevance(video_data["text"])
+    relevant = relevance_detector.detect_relevance(text)
     return {"relevant": relevant}


 @app.post("/summarize")
-async def summarize(request: VideoRequest):
-    video_id = request.video_id
-    video_data = fetch_and_preprocess_video_data(video_id)
-    if not video_data:
-        raise HTTPException(status_code=400, detail="Failed to fetch video data")
+async def summarize(request: TextRequest):
+    text = request.text
+    if not text:
+        raise HTTPException(status_code=400, detail="No text provided")

-    summary = summarizer.summarize(video_data["text"])
+    summary = summarizer.summarize(text)
     return {"summary": summary}


@@ -259,11 +228,8 @@ async def search(request: QueryRequest):
     if not query:
         raise HTTPException(status_code=400, detail="No query provided")

-    try:
-        results = search_engine.search(query)
-        return {"results": results}
-    except ValueError as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    results = search_engine.search(query)
+    return {"results": results}


 @app.post("/topics")
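
After this change the text endpoints take raw text in the request body instead of a YouTube video ID. Below is a minimal sketch of calling the updated API, assuming the app is served locally (e.g. `uvicorn app:app`) at http://localhost:8000; the host/port and the `query` field name on QueryRequest are assumptions for illustration, not part of this commit.

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment, not defined in the commit

# /classify, /relevance and /summarize now expect a TextRequest body: {"text": ...}
resp = requests.post(f"{BASE_URL}/classify", json={"text": "A walkthrough of transformer attention."})
print(resp.json())  # {"result": ...}

# /search still takes a QueryRequest body; the field name "query" is assumed here
resp = requests.post(f"{BASE_URL}/search", json={"query": "machine learning tutorials"})
print(resp.json())  # {"results": [...]}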