summaryrefslogtreecommitdiff
path: root/src/tirante/get_chapters_list.py
blob: 83cefc74b441fb7f98fa7645d3b0c3f48987b55a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import urllib3
from bs4 import BeautifulSoup


def _normalize_title(raw_title, manga_name):
    """
    Builds the chapter name used by get_chapters_list from an anchor title.
    raw_title: Value of the anchor's 'title' attribute.
    manga_name: Actual name of the manga; every occurrence is removed.
    Returns the remaining words, lowercased and joined with '_'.
    """
    cleaned = raw_title
    # The removals are applied in this exact order: the manga name goes
    # first, while the punctuation it may contain is still in place.
    for unwanted in (manga_name, '?', ':', '-', '...', ','):
        cleaned = cleaned.replace(unwanted, '')

    # split() with no argument also collapses the runs of whitespace
    # left behind by the removals above.
    return '_'.join(word.lower() for word in cleaned.split())


def get_chapters_list(main_url,
                      manga_name_url,
                      manga_name,
                      reverse_sorted=True):
    """
    Retrieves chapter urls and names. Returns a list of lists
    containing the url and the title of the chapter.
    main_url: Main webpage name (source).
    manga_name_url: Name of the manga in the url format
    that's used by the webpage.
    manga_name: Actual name of the manga, as it appears in the webpage.
    reverse_sorted: When true, the chapters are returned in the reverse
    of the order they have in the page; otherwise in page order.

    Each returned item is [href, title], where title is the anchor's
    title without the manga name and without '?', ':', '-', '...' and
    ',', lowercased and joined with '_'.
    Rows without an <a> tag, or whose <a> tag lacks 'href' or 'title',
    are skipped. A page without any "row" div yields an empty list.
    """

    manga_url = ''.join([main_url, manga_name_url])

    # Not actually a file, but the content of the html.
    # The manager is used as a context manager so its connection pools
    # are released once the request is done.
    with urllib3.PoolManager() as http:
        html = http.request('GET', manga_url)

    # Get the data from the html and parse it.
    soup = BeautifulSoup(html.data, 'html.parser')

    # Get the "rows" class, this contains the url
    # and title data for each chapter.
    # The first tag is dropped, since it's not useful. Slicing (rather
    # than del) keeps a page with no rows from raising IndexError.
    chapter_rows = soup.find_all('div', {'class': 'row'})[1:]

    # Stores [url, chapter name] for each usable row.
    chapter_list = []

    for row in chapter_rows:
        anchor = row.a
        if anchor is None:
            # Row without a link: there is no chapter to record.
            continue

        href = anchor.get('href')
        raw_title = anchor.get('title')
        if href is None or raw_title is None:
            continue

        chapter_list.append([href, _normalize_title(raw_title, manga_name)])

    if reverse_sorted:
        return chapter_list[::-1]
    return chapter_list