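"""Streamlit page for extracting Odia text from URLs and sitemaps.

Given a single URL or a sitemap (including nested sitemap indexes), this page
pulls each page's main content with jusText using a custom Odia stoplist, and
lets the user download the result as plain text or JSONL.
"""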
import streamlit as st

# Set the page config to centered layout (must be the first Streamlit call)
st.set_page_config(layout="centered")

from utils.footer import cust_footer

from lxml import etree
import justext
import concurrent.futures
import datetime
import requests
from bs4 import BeautifulSoup
import json

# ----- FUNCTIONS -----
# Function to check whether the URL points to a sitemap
def check_sitemap(url):
    # Check the URL's ending first
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Fetch and parse the content as XML
            response = requests.get(url, timeout=10)
            xml_content = etree.fromstring(response.content)
            # Sitemap roots are namespaced, so compare the local tag name
            # rather than the raw tag string
            root_tag = etree.QName(xml_content.tag).localname
            if root_tag in ('urlset', 'sitemapindex'):
                return True
        except Exception:
            st.error("Invalid sitemap!")
    # Fallback heuristic: a reachable URL containing 'sitemap'
    elif 'sitemap' in url.lower():
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return True
        except Exception:
            pass

    return False
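
# Illustrative checks (hypothetical URLs):
#   check_sitemap("https://example.com/sitemap.xml")  -> True for a well-formed sitemap
#   check_sitemap("https://example.com/about.html")   -> False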



def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and parse the XML content
    response = requests.get(xml_url, timeout=10)

    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []

    # check if the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps
        for sitemap_tag in sitemap_tags:
            print("sitemap_tags:" + str(sitemap_tag))
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            url = loc_tag.text
            if url.endswith(('.pdf', '.jpg', '.jpeg')):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)

    return extracted_urls
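
# Illustrative sitemap-index shape this function recurses into (hypothetical URLs):
#   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <sitemap><loc>https://example.com/sitemap-posts.xml</loc></sitemap>
#     <sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
#   </sitemapindex>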



# Function to check whether the entered URL is reachable
def valid_url(url):
    try:
        # Make a GET request and treat a 200 response as valid
        response = requests.get(url, timeout=10)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
    


# function to create a custom stoplist for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)
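
# Note: jusText scores paragraphs by the share of tokens found in the stoplist;
# the stopword thresholds passed in extract_data_from_url_ below are loosened
# relative to jusText's defaults because this custom Odia list is much smaller
# than the built-in stoplists.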



# Function to extract the main text from a URL using jusText
def extract_data_from_url_(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    page = response.content

    para = ""
    paragraphs = justext.justext(
        page, custom_stoplist(),
        length_low=70, length_high=140,
        stopwords_low=0.0, stopwords_high=0.02,
        max_link_density=0.5, max_heading_distance=150,
        no_headings=False,
    )
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text

    return para
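
# Illustrative standalone usage (hypothetical URL):
#   text = extract_data_from_url_("https://example.com/odia-article")
#   print(text[:200])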





# Function to process a batch of URLs from a sitemap
def process_urls(sitemap_urls, category):
    extracted_txt = ""
    extracted_jsonl_list = []
    for url in sitemap_urls:
        if valid_url(url):
            print(url)
            # Extract the main text with jusText
            temp_para = extract_data_from_url_(url)
            temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
            temp_jsonl_data = {"text": temp_para, "url": url, "category": category, "timestamp": str(datetime.datetime.now())}
            extracted_txt += temp_txt_data
            extracted_jsonl_list.append(temp_jsonl_data)
        else:
            st.error("Couldn't extract data from " + url)

    # Serialize each record as one JSON line (JSONL)
    extracted_jsonl_list_encoded = [json.dumps(data, ensure_ascii=False) for data in extracted_jsonl_list]
    extracted_jsonl = '\n'.join(extracted_jsonl_list_encoded)

    return extracted_txt, extracted_jsonl
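
# Each JSONL line produced above has this shape (illustrative values):
#   {"text": "...extracted text...", "url": "https://example.com/page",
#    "category": "News Articles", "timestamp": "2024-01-01 12:00:00.000000"}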



# Function to process a single URL
def run_function(url, category):
    extracted_txt = ""

    try:
        # Check if the user has provided a URL
        if url:
            if valid_url(url):
                temp_para = extract_data_from_url_(url)
                extracted_txt = '\n\nFrom url:' + url + '\n' + temp_para + '\n'
                extracted_jsonl = {"text": str(temp_para), "url": str(url), "category": category, "timestamp": str(datetime.datetime.now())}

                # Display the extracted text for the single URL
                st.text_area("Extracted Text", value=extracted_txt, height=200)

                extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)

                # Return the extract status and the extracted data
                return True, extracted_txt, extracted_jsonl
            else:
                return False, None, None
        else:
            st.error("Error: Please enter a URL.")
            return False, None, None
    except Exception:
        st.error("Invalid URL")

    return False, None, None


def main():
    st.subheader("Extract Data from URLs")

    category = st.selectbox(
        'Select a Category',
        ('News Articles', 'Poems', 'Magazines', 'Other'))
    
    # Divide the body into two columns: URL input and Enter button
    col1, col2 = st.columns([0.7, 0.3])

    with col1:
        url_or_xml = st.text_input(label='', placeholder="Enter URL")
        is_a_sitemap = check_sitemap(url_or_xml)

    with col2:
        st.write('##')
        if "button_enter_url" not in st.session_state:
            st.session_state.button_enter_url = False

        if st.button("Enter"):
            st.session_state.button_enter_url = True

    if "extracted_url" not in st.session_state:
        st.session_state.extracted_url = False

    

    # the enter button
    if st.session_state.button_enter_url:
        # check if it is a sitemap or not
        if is_a_sitemap:
            if "Initial" not in st.session_state:
                st.session_state.Initial = True
            # Streamlit reruns the script on every interaction, so only crawl
            # the sitemap on the initial run; results persist in session state
            if st.session_state.Initial:
                
                xml = url_or_xml
                st.write("It is a sitemap")
                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
                print('\nno. of urls: ', len(stored_sitemap_urls))
                st.write('no. of urls: {}'.format(len(stored_sitemap_urls)))

                if stored_sitemap_urls:
                    print(stored_sitemap_urls)
                    current_time = datetime.datetime.now()
                    print(current_time)
                    st.write(current_time)

                    num_threads = 16  # Number of worker threads to use

                    # Calculate the slice size for each thread; any remainder
                    # falls to the last thread
                    split_size = len(stored_sitemap_urls) // num_threads

                    # Create a ThreadPoolExecutor with at most `num_threads` threads
                    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
                        futures = []
                        for i in range(num_threads):
                            start_index = i * split_size
                            # The last thread takes everything up to the end (end_index=None)
                            end_index = start_index + split_size if i != num_threads - 1 else None
                            temp_urls = stored_sitemap_urls[start_index:end_index]
                            future = executor.submit(process_urls, temp_urls, category)
                            futures.append(future)
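
                        # Worked example: 100 URLs across 16 threads gives split_size = 6,
                        # so threads 0-14 take 6 URLs each and the last takes the remaining 10.
                        # Futures are collected in submission order below, preserving URL order.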

                        # Retrieve the extracted data from each thread
                        text_data = []
                        jsonl_data = []
                        for future in futures:
                            text_result, jsonl_result = future.result()
                            text_data.append(text_result)
                            jsonl_data.append(jsonl_result)

                    # Combine the extracted data from all threads
                    combined_text_data = ''.join(text_data)
                    combined_jsonl_data = '\n'.join(jsonl_data)

                    if "sitemap_data_jsonl" not in st.session_state:
                        st.session_state.sitemap_data_jsonl = combined_jsonl_data
                    if "sitemap_data_text" not in st.session_state:
                        st.session_state.sitemap_data_text = combined_text_data

                    current_time = datetime.datetime.now()
                    print(current_time)
                    st.write(current_time)
                    st.session_state.Initial = False
                    print("Initial state:", st.session_state.Initial)
                    st.session_state.extracted_url = True

                else:
                    st.error("Error: Invalid sitemap.")


        else:
            url = url_or_xml
            st.session_state.extracted_url, data_txt, data_jsonl = run_function(url, category)

        
        if st.session_state.extracted_url:
            # displaying extracted txt for sitemaps
            if is_a_sitemap:
                st.text_area("Extracted Text", value=st.session_state.sitemap_data_text, height=300)

            save_as, checkbox_c1, checkbox_c2 = st.columns([0.33, 0.33, 0.33])

            # Initialize the checkbox bools
            save_as_txt = False
            save_as_json = False
            saved_successfully = False

            with save_as:
                st.write("Save as ")
            with checkbox_c1:
                save_as_txt = st.checkbox("text", value=False)

            with checkbox_c2:
                save_as_json = st.checkbox("jsonl", value=False)

            if not save_as_txt and not save_as_json:
                clear_c1, clear_c2 = st.columns([0.5, 0.5])
                with clear_c1:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
                with clear_c2:
                    pass
            elif save_as_txt != save_as_json:
                # Exactly one output format selected
                col1, col2 = st.columns([0.5, 0.5])
                # Save column
                with col1:
                    if is_a_sitemap:
                        if save_as_txt:
                            if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
                                saved_successfully = True
                        if save_as_json:
                            if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
                                saved_successfully = True
                    else:
                        if save_as_txt:
                            if st.download_button(label="Save as txt", data=data_txt):
                                saved_successfully = True
                        if save_as_json:
                            if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
                                saved_successfully = True

                # Clear column
                with col2:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
            elif save_as_txt and save_as_json:
                savetxt_c1, saveJson_c2, clear_c3 = st.columns([0.25, 0.25, 0.5])
                with savetxt_c1:
                    if is_a_sitemap:
                        if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
                            saved_successfully = True
                    else:
                        if st.download_button(label="Save as txt", data=data_txt):
                            saved_successfully = True
                with saveJson_c2:
                    if is_a_sitemap:
                        if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
                            saved_successfully = True
                    else:
                        if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
                            saved_successfully = True
                with clear_c3:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
            if saved_successfully:
                # Confirmation message
                st.success("File saved successfully.")
            st.write("#")
            st.write("#")
        else:
            st.warning("Data not extracted")
            notextracted_c1, notextracted_c2 = st.columns([0.5, 0.5])
            with notextracted_c1:
                if st.button("clear"):
                    st.session_state.button_enter_url = False
                    st.session_state.extracted_url = False
                    st.experimental_rerun()
            with notextracted_c2:
                pass
            st.write("#")
            st.write("#")


    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")

    # Render the custom footer from utils
    cust_footer()
    

if __name__ == "__main__":
    main()