dromerosm committed on
Commit
608e720
1 Parent(s): 5ea6b3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -14
app.py CHANGED
@@ -1,32 +1,49 @@
1
  import gradio as gr
2
  import os
3
  import openai
4
- import newspaper
5
  import json
6
  import re
7
  from transformers import GPT2Tokenizer
 
 
 
 
 
8
 
9
 
10
  # define the text summarizer function
11
  def text_prompt(request, page_url, contraseña, temp):
12
  try:
13
- page = newspaper.Article(url=page_url)
14
- page.download()
 
 
 
 
15
  page.parse()
 
16
  except Exception as e:
17
  return "", f"--- Ha ocurrido un error al procesar la URL: {e} ---", ""
18
-
19
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
20
-
21
- tokens = tokenizer.tokenize(page.text)
22
 
23
- # Recortar el texto a un máximo de 1000 tokens
24
- num_tokens = len(tokens)
25
 
26
- if num_tokens > 1800:
27
- tokens = tokens[:1800]
28
 
29
- page_text = " ".join(tokens)
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  if num_tokens > 10:
32
  openai.api_key = contraseña
@@ -41,9 +58,11 @@ def text_prompt(request, page_url, contraseña, temp):
41
  )
42
  # get the response text
43
  response_text = response.choices[0].text
 
 
44
  # clean the response text
45
  response_text = re.sub(r'\s+', ' ', response_text)
46
- return page.text, response_text, num_tokens
47
  except Exception as e:
48
  return page.text, f"--- Ha ocurrido un error al procesar la solicitud: {e} ---", num_tokens
49
  return page.text, "--- Min number of tokens ---", num_tokens
@@ -56,7 +75,7 @@ iface = gr.Interface(
56
  gr.Textbox(lines=1, placeholder="Enter your API-key here...", label="API-Key:", type="password"),
57
  gr.Slider(0.0,1.0, value=0.3, label="Temperature:")
58
  ],
59
- outputs=[gr.Textbox(label="Input:"), gr.Textbox(label="Output:"), gr.Textbox(label="Tokens:")],
60
  examples=[["Summarize the following text as a list:","https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/","",0.3],
61
  ["Generate a summary of the following text. Give me an overview of main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:", "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html","",0.7],
62
  ["Generate the next insights based on the following text. Indicates N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):","https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/","",0.3]
 
1
  import gradio as gr
2
  import os
3
  import openai
4
+ from newspaper import Article
5
  import json
6
  import re
7
  from transformers import GPT2Tokenizer
8
+ import nltk
9
+ from nltk.tokenize import sent_tokenize
10
+ import requests
11
+
12
+ nltk.download('punkt')
13
 
14
 
15
  # define the text summarizer function
16
  def text_prompt(request, page_url, contraseña, temp):
17
  try:
18
+ headers = {'User-Agent': 'Chrome/83.0.4103.106'}
19
+ response = requests.get(page_url, headers=headers)
20
+ html = response.text
21
+
22
+ page = Article('')
23
+ page.set_html(html)
24
  page.parse()
25
+
26
  except Exception as e:
27
  return "", f"--- Ha ocurrido un error al procesar la URL: {e} ---", ""
 
 
 
 
28
 
29
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
30
+ sentences = sent_tokenize(page.text)
31
 
32
+ tokens = []
33
+ page_text = ""
34
 
35
+ for sentence in sentences:
36
+ tokens.extend(tokenizer.tokenize(sentence))
37
+
38
+ # Recortar el texto a un máximo de 1800 tokens
39
+ if len(tokens) > 1800:
40
+ break
41
+ page_text += sentence + " "
42
+
43
+ # Eliminar el ultimo espacio
44
+ page_text = page_text.strip()
45
+
46
+ num_tokens = len(tokens)
47
 
48
  if num_tokens > 10:
49
  openai.api_key = contraseña
 
58
  )
59
  # get the response text
60
  response_text = response.choices[0].text
61
+ total_tokens = response["usage"]["total_tokens"]
62
+
63
  # clean the response text
64
  response_text = re.sub(r'\s+', ' ', response_text)
65
+ return page.text, response_text, total_tokens
66
  except Exception as e:
67
  return page.text, f"--- Ha ocurrido un error al procesar la solicitud: {e} ---", num_tokens
68
  return page.text, "--- Min number of tokens ---", num_tokens
 
75
  gr.Textbox(lines=1, placeholder="Enter your API-key here...", label="API-Key:", type="password"),
76
  gr.Slider(0.0,1.0, value=0.3, label="Temperature:")
77
  ],
78
+ outputs=[gr.Textbox(label="Input:"), gr.Textbox(label="Output:"), gr.Textbox(label="Total Tokens:")],
79
  examples=[["Summarize the following text as a list:","https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/","",0.3],
80
  ["Generate a summary of the following text. Give me an overview of main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:", "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html","",0.7],
81
  ["Generate the next insights based on the following text. Indicates N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):","https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/","",0.3]