summaryrefslogtreecommitdiff
path: root/src/tirante/get_chapter_image_list.py
blob: a0881c6464190b98eeecd8da23b990812f842f95 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import urllib3
from bs4 import BeautifulSoup


def get_chapter_image_list(chapter_data):
    """
    Collect the image links found on a chapter page.

    chapter_data: A sequence whose first item is the url of the chapter
        page. Only that first item is read here.
    NOTE: Not for direct use with the result of 'get_chapters_list'

    Returns a list of two-item lists, '[image_url, file_name]', one per
    kept '<img>' tag, where 'file_name' is the part of the url after its
    last '/'. The list is empty when the page has fewer than three
    '<img>' tags.
    """

    # Not actually a file, but the response holding the content of the html.
    html = urllib3.PoolManager().request('GET', chapter_data[0])

    # Get the data from the html and parse it.
    soup = BeautifulSoup(html.data, 'html.parser')

    # Take every <img> tag of the page except the first and the last one,
    # since they're trash. Slicing (instead of deleting by index) keeps
    # this safe when the page has fewer than two images: the result is
    # simply empty rather than an IndexError.
    soup_img = soup.find_all('img')[1:-1]

    # Stores each image url in a list, together with its file name.
    image_url_list = []
    for img in soup_img:
        # An <img> without a usable 'src' has nothing to download; skip
        # it instead of failing with a KeyError.
        src = img.get('src')
        if not src:
            continue

        # Splits the url by the char '/' and takes the last item,
        # which is the name of the file.
        image_url_list.append([src, src.split('/')[-1]])

    return image_url_list