diff options
Diffstat (limited to 'src/tirante/get_chapter_image_list.py')
-rw-r--r-- | src/tirante/get_chapter_image_list.py | 35 |
1 files changed, 35 insertions, 0 deletions
import urllib3
from bs4 import BeautifulSoup


def get_chapter_image_list(chapter_data):
    """
    Download a chapter page and list the images found in it.

    chapter_data: A list containing a url and a title; only the url
        (the first item) is used here.
    Returns a list of [image_url, file_name] pairs, one per kept <img>
        tag, in document order. file_name is the last '/'-separated
        segment of image_url. The list is empty when the page has
        fewer than three <img> tags.
    NOTE: Not for direct use with the result of 'get_chapters_list'
    """

    # Use the pool manager as a context manager so its pooled connections
    # are released as soon as the page has been downloaded. The body is
    # read inside the block, before the pool is cleared.
    with urllib3.PoolManager() as http:
        page_content = http.request('GET', chapter_data[0]).data

    # Parse the raw html of the chapter page.
    soup = BeautifulSoup(page_content, 'html.parser')

    # Every <img> in the page is collected; the first and the last one are
    # discarded because they are not wanted in the result. Slicing (rather
    # than two 'del' statements) keeps this from raising IndexError on a
    # page with zero or one image.
    page_images = soup.find_all('img')[1:-1]

    # Pair each image url with its file name. Tags with a missing or empty
    # 'src' attribute are skipped instead of raising KeyError.
    return [
        [src, src.split('/')[-1]]
        for src in (img.get('src') for img in page_images)
        if src
    ]