Harsh502s committed
Commit 849f315 · 1 Parent(s): 7fbe44b
Pages/Recommender App.py CHANGED
@@ -7,11 +7,10 @@ import pickle
 @st.cache_data
 def load_data():
     try:
-        anime_data = pd.read_csv(r"anime_rec.csv")
-        anime_posters = pd.read_csv(r"anime_data_cleaned.csv")
+        anime_data = pd.read_csv(r"rec_data.csv")
     except:
         st.error("Dataset Not Found")
-    return anime_data, anime_posters
+    return anime_data


 # Uncomment this if you want to load the model
@@ -25,17 +24,17 @@ def load_data():


 # similarity = load_model()
-anime_data, anime_posters = load_data()
+anime_data = load_data()


 # Fetching the poster and url of the anime
 def fetch_anime_url(anime_id):
-    url = anime_posters[anime_posters["anime_id"] == anime_id].urls.values[0]
+    url = anime_data[anime_data["anime_id"] == anime_id].urls.values[0]
     return url


 def fetch_poster(anime_id):
-    poster = anime_posters[anime_posters["anime_id"] == anime_id].poster.values[0]
+    poster = anime_data[anime_data["anime_id"] == anime_id].poster.values[0]
     return poster


@@ -83,33 +82,33 @@ def recommender_page():

     if st.button("Recommendation"):
         if anime_select == "Top 8 Animes":
-            top8 = anime_posters.sort_values("score", ascending=False).head(8)
+            top8 = anime_data.sort_values("score", ascending=False).head(8)
             col1, col2, col3, col4 = st.columns(4)
             with col1:
-                st.write(f"[{top8.iloc[0].title}]({top8.iloc[0].urls})")
+                st.write(f"[{top8.iloc[0].title}]({top8.iloc[0].anime_url})")
                 st.image(top8.iloc[0].poster)
             with col2:
-                st.write(f"[{top8.iloc[1].title}]({top8.iloc[1].urls})")
+                st.write(f"[{top8.iloc[1].title}]({top8.iloc[1].anime_url})")
                 st.image(top8.iloc[1].poster)
             with col3:
-                st.write(f"[{top8.iloc[2].title}]({top8.iloc[2].urls})")
+                st.write(f"[{top8.iloc[2].title}]({top8.iloc[2].anime_url})")
                 st.image(top8.iloc[2].poster)
             with col4:
-                st.write(f"[{top8.iloc[3].title}]({top8.iloc[3].urls})")
+                st.write(f"[{top8.iloc[3].title}]({top8.iloc[3].anime_url})")
                 st.image(top8.iloc[3].poster)

             col5, col6, col7, col8 = st.columns(4)
             with col5:
-                st.write(f"[{top8.iloc[4].title}]({top8.iloc[4].urls})")
+                st.write(f"[{top8.iloc[4].title}]({top8.iloc[4].anime_url})")
                 st.image(top8.iloc[4].poster)
             with col6:
-                st.write(f"[{top8.iloc[5].title}]({top8.iloc[5].urls})")
+                st.write(f"[{top8.iloc[5].title}]({top8.iloc[5].anime_url})")
                 st.image(top8.iloc[5].poster)
             with col7:
-                st.write(f"[{top8.iloc[6].title}]({top8.iloc[6].urls})")
+                st.write(f"[{top8.iloc[6].title}]({top8.iloc[6].anime_url})")
                 st.image(top8.iloc[6].poster)
             with col8:
-                st.write(f"[{top8.iloc[7].title}]({top8.iloc[7].urls})")
+                st.write(f"[{top8.iloc[7].title}]({top8.iloc[7].anime_url})")
                 st.image(top8.iloc[7].poster)
         else:
             (
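
The eight copy-pasted column blocks in the new version could be collapsed into a loop. A minimal sketch, not part of the commit, assuming the surrounding app's Streamlit imports and the same anime_data columns (title, anime_url, poster, score):

# Sketch only: render the top-8 grid as two rows of four columns,
# replacing the eight hand-unrolled `with colN:` blocks above.
top8 = anime_data.sort_values("score", ascending=False).head(8)
for start in (0, 4):
    for col, (_, row) in zip(st.columns(4), top8.iloc[start : start + 4].iterrows()):
        with col:
            st.write(f"[{row.title}]({row.anime_url})")
            st.image(row.poster)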
anime_rec.csv DELETED
The diff for this file is too large to render. See raw diff
 
anime_data_cleaned.csv → rec_data.csv RENAMED
The diff for this file is too large to render. See raw diff
 
scrape_anime.py ADDED
@@ -0,0 +1,150 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import warnings as w
+
+w.filterwarnings("ignore")
+
+
+# Read the total page count from the last pagination link on the A-Z listing
+no_of_page = int(
+    BeautifulSoup(requests.get("https://aniwatch.to/az-list").content, "lxml")
+    .find("nav", attrs={"aria-label": "Page navigation"})
+    .find_all("li")[-1]
+    .find("a")["href"]
+    .split("=")[1]
+)
+landing_page_url = "https://aniwatch.to/az-list"
+page_urls = [
+    f"{landing_page_url}/?page={i}" if i != 1 else landing_page_url
+    for i in range(1, no_of_page + 1)
+]
+
+# Scraping the data from all the pages
+
+anime_urls = []
+
+for url in tqdm(page_urls):
+    page = requests.get(url)
+    soup = BeautifulSoup(page.content, "html.parser")
+
+    # Getting the url for the anime page
+    for anime in soup.find_all("div", class_="film-poster"):
+        anime = anime.find("a")["href"]
+        page = "https://aniwatch.to" + anime
+        anime_urls.append(page)
+
+anime_url = pd.DataFrame(anime_urls, columns=["anime_url"])
+anime_url.to_csv("anime_url.csv", index=False)
+
+
+# def process_url(url):
+#     soup = BeautifulSoup(requests.get(url).content, "html.parser")
+
+#     anime_poster = soup.find("div", class_="film-poster").find("img")["src"]
+
+#     # Getting the name of the anime
+#     anime_title = soup.find("h2", class_="film-name dynamic-name").text
+
+#     # Getting the overview of the anime
+#     anime_overview = (
+#         soup.find("div", class_="item item-title w-hide")
+#         .find("div", class_="text")
+#         .text
+#     )
+
+#     # Creating an object of the div containing all the details of the anime
+#     soup = soup.find("div", class_="anisc-info")
+
+#     # Extract MAL Score
+#     mal_score_element = soup.find("span", {"class": "item-head"}, text="MAL Score:")
+#     anime_mal_score = (
+#         mal_score_element.find_next_sibling("span", {"class": "name"}).text.strip()
+#         if mal_score_element
+#         else "NA"
+#     )
+
+#     # Extract Studios
+#     studios_element = soup.find("span", {"class": "item-head"}, text="Studios:")
+#     anime_studio = (
+#         studios_element.find_next("a", {"class": "name"}).text.strip()
+#         if studios_element
+#         else "NA"
+#     )
+
+#     # Extract Producers
+#     producers_element = soup.find("span", {"class": "item-head"}, text="Producers:")
+#     anime_producer = (
+#         [
+#             producer.text.strip()
+#             for producer in producers_element.find_next_siblings("a")
+#         ]
+#         if producers_element
+#         else ["NA"]
+#     )
+
+#     # Extract Genres
+#     genres_element = soup.find("span", {"class": "item-head"}, text="Genres:")
+#     anime_genres = (
+#         [genre.text.strip() for genre in genres_element.find_next_siblings("a")]
+#         if genres_element
+#         else ["NA"]
+#     )
+
+#     return (
+#         anime_poster,
+#         anime_title,
+#         anime_overview,
+#         anime_mal_score,
+#         anime_studio,
+#         anime_producer,
+#         anime_genres,
+#     )
+
+
+# def create_df_parallel(anime_urls, num_threads=4):
+#     anime_poster_list = []
+#     anime_title_list = []
+#     anime_overview_list = []
+#     anime_mal_score_list = []
+#     anime_studio_list = []
+#     anime_producer_list = []
+#     anime_genres_list = []
+
+#     with ThreadPoolExecutor(max_workers=num_threads) as executor:
+#         results = executor.map(process_url, anime_urls)
+
+#     for result in results:
+#         anime_poster_list.append(result[0])
+#         anime_title_list.append(result[1])
+#         anime_overview_list.append(result[2])
+#         anime_mal_score_list.append(result[3])
+#         anime_studio_list.append(result[4])
+#         anime_producer_list.append(result[5])
+#         anime_genres_list.append(result[6])
+
+#     anime_dict = {
+#         "anime_poster": anime_poster_list,
+#         "anime_title": anime_title_list,
+#         "anime_overview": anime_overview_list,
+#         "anime_mal_score": anime_mal_score_list,
+#         "anime_studio": anime_studio_list,
+#         "anime_producer": anime_producer_list,
+#         "anime_genres": anime_genres_list,
+#     }
+
+#     anime_df = pd.DataFrame(anime_dict)
+#     return anime_df
+
+
+# anime_df = create_df_parallel(anime_urls)
+# anime_df.head()
+# anime_df.to_csv("anime_data.csv", index=False)
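
If the commented-out parallel scraper above were re-enabled, it could be driven from the URL list the script already saves. A minimal sketch, not part of the commit, assuming process_url and create_df_parallel are uncommented as defined:

import pandas as pd

# Hypothetical driver for the commented-out pipeline; anime_url.csv is
# the file written by the pagination loop in scrape_anime.py.
anime_urls = pd.read_csv("anime_url.csv")["anime_url"].tolist()
anime_df = create_df_parallel(anime_urls, num_threads=8)
anime_df.to_csv("anime_data.csv", index=False)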
similarity_matrix.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6467e01f6ad99f76155ccae156d96e285d22408947461b215eb0772730298888
-size 248912835