summaryrefslogtreecommitdiff
path: root/src/tirante/get_chapters_list.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/tirante/get_chapters_list.py')
-rw-r--r--src/tirante/get_chapters_list.py65
1 files changed, 65 insertions, 0 deletions
diff --git a/src/tirante/get_chapters_list.py b/src/tirante/get_chapters_list.py
new file mode 100644
index 0000000..83cefc7
--- /dev/null
+++ b/src/tirante/get_chapters_list.py
@@ -0,0 +1,65 @@
+import urllib3
+from bs4 import BeautifulSoup
+
+
+def get_chapters_list(main_url,
+ manga_name_url,
+ manga_name,
+ reverse_sorted=True):
+ """
+ Retrieves chapter urls and names. Returns a list of lists
+ containing the url and the title of the chapter.
+ main_url: Main webpage name (source).
+ manga_name_url: Name of the manga in the url format
+ that's used by the webpage.
+ manga_name: Actual name of the manga, as it appears in the webpage.
+ reverse_sorted: Sorting of the final array.
+ """
+
+ manga_url = ''.join([main_url, manga_name_url])
+
+ # Not actually a file, but the content of the html.
+ html = urllib3.PoolManager().request('GET', manga_url)
+
+ # Get the data from the html and parse it.
+ soup = BeautifulSoup(html.data, 'html.parser')
+
+ # Get the "rows" class, this contains the url
+ # and title data for each chapter.
+ # Deletes the first tag, since it's not useful.
+ soup_rows = soup.find_all('div', {'class': 'row'})
+ del soup_rows[0]
+
+ # Creates a list to store date for each url and chapter name.
+ chapter_list = []
+
+ for row in soup_rows:
+
+ # Gets the url name from the a tag.
+ href = row.a['href']
+ # Same, for the title. Deletes every ocurrance of the manga name,
+ # unwanted characters and then gets everyword.
+ title_words = row.a['title'].replace(manga_name, '').replace('?', '')
+ title_words = title_words.replace(':', '').replace('-', '')
+ title_words = title_words.replace('...', '').replace(',', '').split()
+
+ # Doing all the work in oneliner doesn't work for some chapters,
+ # for some reason.
+ # title = '_'.join(row.a['title'].replace(manga_name, '')
+ # .replace(':', '').replace('-', '').lower().split())
+
+ # Lowers every word and appends it to a new list,
+ # then it gets joined with '_' as a sep.
+ title_words_lower = []
+ for word in title_words:
+ title_words_lower.append(word.lower())
+
+ title = '_'.join(title_words_lower)
+
+ # print(href, title)
+ chapter_list.append([href, title])
+
+ if reverse_sorted:
+ return chapter_list[::-1]
+ else:
+ return chapter_list