Removed print statements and comments
scrape_sources.py (+6, -9)
@@ -26,13 +26,12 @@ class NPRLite(Source):
     # and identified entities for each article.
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
-        print("retrieving NPR article stub")
+        #print("retrieving NPR article stub")
         """Creates article stubs for articles listed on text.npr.org"""
         # Scrape NPR for headlines and links
         soup = Soup(get(self.source_url))
         # extract each headline
         npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-        #npr_hed = [i for i in npr_hed if 'Opinion:' not in i]
         # links scraped are just the extension to the site's base link.
         npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
         # limit amount of data being returned for clustering
@@ -42,13 +41,13 @@
         # Create stubs with heds and links
         # Test: do the headlines and links zipped together lineup correctly?
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(npr_links, npr_hed)]
-        print(f"Number of npr articles: {len(npr_hed)}")
+        #print(f"Number of npr articles: {len(npr_hed)}")
         return article_tuples, len(npr_hed)

     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from text.npr.org subhead if exists, date, author(s), and whole text"""
-        st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
+        #st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
         container = Soup(get(self.source_url[:-5] + indata.link))
         text_container = container.find('div', {'class': "paragraphs-container"}).find('p')
         if isinstance(text_container, Soup):
@@ -82,7 +81,6 @@ class CNNText(Source):
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
         """Creates a stub for each article listed on lite.cnn.com"""
-        print("retrieving CNN article stub")
         soup = Soup(get(self.source_url))
         # Scrape NPR for headlines and links
         cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
@@ -91,16 +89,15 @@ class CNNText(Source):
         if limit is not None:
             cnn_heds = cnn_heds[:limit]
             cnn_links = cnn_links[:limit]
-        #cnn = [i for i in cnn_heds if 'Analysis:' not in i and 'Opinion:' not in i]
         # Take this next line out of this function and place it where this data is used.
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(cnn_links, cnn_heds) if 'Opinion' not in i[1] and 'Analysis' not in i[1]]
-
+
         return article_tuples, len(cnn_heds)

     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from lite.cnn.com: subhead if exists, date, author(s), and whole text"""
-        print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+        #print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         repeat = 0
         good = False
@@ -114,7 +111,7 @@ class CNNText(Source):
             repeat += 1
             if good:
                 story_container = container.find('div', {'class': 'afe4286c'})
-                print(story_container)
+                #print(story_container)
                 author = story_container.find('p',{'id':'byline'}).text
                 story_date = story_container.find('p',{'id':'published datetime'}).text[9:]
                 #if isinstance(story_container, Soup):
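The scraping calls in this diff lean on gazpacho's mode-switching find(): it returns a single Soup when exactly one node matches, a list of Soup objects when several match, and None when none do, which is why the NPR code guards with isinstance(text_container, Soup) before treating the result as a single node. A minimal standalone sketch of the same pattern; the 'afe4286c' class name and the lite.cnn.com URL are taken from the diff and may no longer match CNN's live markup:

from gazpacho import Soup, get

html = get('https://lite.cnn.com')  # assumed to be CNNText's source_url
container = Soup(html).find('div', {'class': 'afe4286c'})

# find() may hand back one Soup, a list of Soups, or None;
# normalize to a list before iterating, as the scraper's guard implies.
links = container.find('a')
if isinstance(links, Soup):
    links = [links]
heds = [(a.text, a.attrs.get('href')) for a in (links or [])]
print(heds[:5])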
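Note that the commit silences the prints by commenting them out rather than deleting them, so they can be re-enabled by hand. Purely as an illustrative alternative, assuming nothing about the rest of the repo, the same diagnostics could go through the standard logging module and be toggled globally instead of edited in and out:

import logging

logger = logging.getLogger('scrape_sources')

def log_retrieval(base_url: str, link: str) -> None:
    # Silent by default; stands in for the ad-hoc print()/st.write() pairs.
    logger.debug('Retrieving article from: %s', base_url + link)

# Opt in to the output only while debugging the scrapers:
logging.basicConfig(level=logging.DEBUG)
log_retrieval('https://lite.cnn.com', '/en/article/h_example')  # hypothetical link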