import os

import urllib3
from bs4 import BeautifulSoup
import requests

# Main manga source.
MAIN_URL = 'https://manganelo.com/manga/'
# Manga name.
MANGA_NAME = 'Kimetsu no Yaiba'
# Manga name in the form in which it appears in the url.
MANGA_NAME_URL = 'kimetsu_no_yaiba/'
# PC main file location.
MANGA_DIR = 'E:\\Mangas\\'
# PC main manga data location.
MANGA_DATA_DIR = ''.join(['C:\\Users\\Lorentzeus\\Google Drive\\',
                          'Personal\\Python\\mangas\\data'])


def get_chapters_list(main_url=MAIN_URL, manga_name_url=MANGA_NAME_URL,
                      manga_name=MANGA_NAME, reverse_sorted=True):
    """
    Retrieves chapter urls and names.
    Returns a list of lists containing the url and the title of the chapter.

    main_url: Main webpage name (source).
    manga_name_url: Name of the manga in the url format used by the webpage.
    manga_name: Actual name of the manga, as it appears in the webpage.
    reverse_sorted: Sorting of the final list.
    """
    # Use the parameters (not the module constants), so the function
    # works for any manga passed in.
    manga_url = ''.join([main_url, manga_name_url])
    # Not actually a file, but the content of the html.
    html = urllib3.PoolManager().request('GET', manga_url)
    # Get the data from the html and parse it.
    soup = BeautifulSoup(html.data, 'html.parser')
    # Get the "row" class divs; these contain the url and title data
    # for each chapter. The first tag isn't useful, so it's removed.
    soup_rows = soup.find_all('div', {'class': 'row'})
    del soup_rows[0]
    # Store the url and chapter name for each chapter.
    chapter_list = []
    for row in soup_rows:
        # Get the url from the a tag.
        href = row.a['href']
        # Same for the title. Delete every occurrence of the manga name
        # and of unwanted characters, then split into words.
        title_words = row.a['title'].replace(manga_name, '').replace('?', '')
        title_words = title_words.replace(':', '').replace('-', '')
        title_words = title_words.replace('...', '').replace(',', '').split()
        # Doing all the work in a oneliner doesn't work for some chapters,
        # for some reason:
        # title = '_'.join(row.a['title'].replace(manga_name, '')
        #                  .replace(':', '').replace('-', '').lower().split())
        # Lowercase every word and append it to a new list,
        # then join with '_' as a separator.
        title_words_lower = []
        for word in title_words:
            title_words_lower.append(word.lower())
        title = '_'.join(title_words_lower)
        # print(href, title)
        chapter_list.append([href, title])
    if reverse_sorted:
        return chapter_list[::-1]
    return chapter_list


def chapters_list_to_csv(chapters_list, manga_name=MANGA_NAME):
    """
    Creates a csv file from the input chapters_list.

    chapters_list: List of data of the chapters.
    manga_name: Name of the manga, folder-naming friendly.
    """
    # Add '.csv' for the csv creation.
    m_name_ext = ''.join([manga_name, '.csv'])
    # print(m_name_ext)
    with open(m_name_ext, 'w') as outcsv:
        for chapter in chapters_list:
            outcsv.write(''.join([chapter[0], ',', chapter[1], '\n']))


def chapters_csv_to_list(chapter_csv):
    """
    Gives a list of chapters from a csv file.

    chapter_csv: csv file containing the data of the chapters.
    """
    out_chapters_list = []
    with open(chapter_csv, 'r') as incsv:
        for line in incsv.readlines():
            out_chapters_list.append(line.strip().split(','))
    return out_chapters_list
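# A minimal sketch of the csv round trip above, using made-up sample data
# (the url and title below are illustrative, not a real chapter). Writing a
# list with chapters_list_to_csv and reading it back with
# chapters_csv_to_list should return the same [url, title] pairs.
def _example_chapters_roundtrip():
    sample = [['https://manganelo.com/chapter/kimetsu_no_yaiba/chapter_1',
               'chapter_1_cruelty']]
    chapters_list_to_csv(sample, manga_name='sample_manga')
    assert chapters_csv_to_list('sample_manga.csv') == sample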
def get_chapter_image_list(chapter_data):
    """
    Gets the links for each image in the chapter.
    Returns a list of the image urls and their file names.

    chapter_data: A list containing a url and a title.

    NOTE: Not for direct use with the full result of 'get_chapters_list';
    pass a single [url, title] element instead.
    """
    # Not actually a file, but the content of the html.
    html = urllib3.PoolManager().request('GET', chapter_data[0])
    # Get the data from the html and parse it.
    soup = BeautifulSoup(html.data, 'html.parser')
    # Get the img tags (the 'vung-doc' page images); each one holds a url
    # that points to the source of the image. The first and last items
    # aren't pages, so they're removed.
    soup_img = soup.find_all('img')
    del soup_img[0]
    del soup_img[-1]
    # Store each image url in a list.
    image_url_list = []
    for img in soup_img:
        # Get the string of the url, split it by the char '/',
        # and take the last item, which is the name of the file.
        image_url_list.append([img['src'], img['src'].split('/')[-1]])
    return image_url_list


def chapter_image_list_to_csv(chapter_data):
    """
    Creates a csv file for a chapter, given its data.

    chapter_data: A list containing a url and a title.
    """
    ch_name = ''.join([chapter_data[1], '.csv'])
    chapter_image_list = get_chapter_image_list(chapter_data)
    with open(ch_name, 'w') as outcsv:
        for image in chapter_image_list:
            outcsv.write(''.join([image[0], ',', image[1], '\n']))


def chapter_image_csv_to_list(chapter_image_csv):
    """
    Returns a list given the csv file.

    chapter_image_csv: csv containing data for the chapter.
    """
    out_chapter_image_list = []
    with open(chapter_image_csv, 'r') as incsv:
        for line in incsv.readlines():
            # print(line.strip().split(','))
            out_chapter_image_list.append(line.strip().split(','))
    return out_chapter_image_list


def create_database(main_url=MAIN_URL, manga_name_url=MANGA_NAME_URL,
                    manga_name=MANGA_NAME, manga_dir=MANGA_DIR,
                    manga_data_dir=MANGA_DATA_DIR):
    """
    Creates a database from zero, made of csv files.

    main_url: Main webpage name (source).
    manga_name_url: Name of the manga in the url format used by the webpage.
    manga_name: Actual name of the manga, as it appears in the webpage.
    manga_dir: Main manga folder on the computer; subfolders are created here.
    manga_data_dir: Main manga data folder on the computer.

    NOTE: This does not update the database. If a database already exists,
    the creation of new files is skipped.
    """
    # A folder-friendly name for the manga, also used as the name
    # of the main database.
    m_name = '_'.join(word.lower() for word in manga_name.split())
    m_name_ext = ''.join([m_name, '.csv'])
    # Navigate to the main data folder, then to the manga folder,
    # creating it if needed.
    os.chdir(manga_data_dir)
    try:
        os.mkdir(m_name)
        os.chdir(m_name)
    except FileExistsError:
        print(''.join([m_name, ' folder already exists.']))
        os.chdir(m_name)
    # List of files and folders in the current path.
    data_list_dir = os.listdir()
    # Get the list of chapters; if it already exists,
    # read it from the database. This is the main manga data.
    if m_name_ext not in data_list_dir:
        chapters_list = get_chapters_list(main_url=main_url,
                                          manga_name_url=manga_name_url,
                                          manga_name=manga_name)
        chapters_list_to_csv(chapters_list=chapters_list, manga_name=m_name)
    else:
        print(''.join([m_name_ext, ' already exists.']))
        chapters_list = chapters_csv_to_list(m_name_ext)
    # Data for each chapter.
    for chapter in chapters_list:
        # Get the list for the images of each chapter.
        chapter_name_ext = ''.join([chapter[1], '.csv'])
        if chapter_name_ext not in data_list_dir:
            chapter_image_list_to_csv(chapter)
        else:
            print(''.join([chapter_name_ext, ' already exists.']))
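# A minimal sketch of the [url, file name] pairs that get_chapter_image_list
# builds; the url here is made up for illustration.
def _example_image_entry():
    url = 'https://example-cdn.net/kimetsu_no_yaiba/chapter_1/1.jpg'
    # The file name is just the last path component of the url.
    return [url, url.split('/')[-1]]  # ['https://...', '1.jpg']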
def update_database(main_url=MAIN_URL, manga_name_url=MANGA_NAME_URL,
                    manga_name=MANGA_NAME, manga_dir=MANGA_DIR,
                    manga_data_dir=MANGA_DATA_DIR):
    """
    Updates the database already created, adding the missing chapters.

    main_url: Main webpage name (source).
    manga_name_url: Name of the manga in the url format used by the webpage.
    manga_name: Actual name of the manga, as it appears in the webpage.
    manga_dir: Main manga folder on the computer; subfolders are created here.
    manga_data_dir: Main manga data folder on the computer.
    """
    # A folder-friendly name for the manga, also used as the name
    # of the main database.
    m_name = '_'.join(word.lower() for word in manga_name.split())
    m_name_ext = ''.join([m_name, '.csv'])
    # Navigate to the main data folder, then to the manga folder,
    # creating it if needed.
    os.chdir(manga_data_dir)
    try:
        os.mkdir(m_name)
        os.chdir(m_name)
    except FileExistsError:
        print(''.join([m_name, ' folder already exists.']))
        os.chdir(m_name)
    # Get a list of the files present in the path.
    data_list_dir = os.listdir()
    # First, download the data from the web.
    new_chapter_list = get_chapters_list(main_url=main_url,
                                         manga_name_url=manga_name_url,
                                         manga_name=manga_name,
                                         reverse_sorted=False)
    # And then, read the current database.
    last_chapter = chapters_csv_to_list(m_name_ext)[-1]
    # The missing chapters.
    missing_chapters = []
    for chapter in new_chapter_list:
        # Once the last stored chapter is reached, exit the loop.
        if chapter == last_chapter:
            break
        missing_chapters.append(chapter)
    # Reverse the order back to ascending.
    missing_chapters = missing_chapters[::-1]
    # Append the new chapters to the already existing csv file.
    # No need to check whether items are present: that's exactly how
    # missing_chapters was built in the previous step.
    with open(m_name_ext, 'a') as outcsv:
        for chapter in missing_chapters:
            outcsv.write(''.join([chapter[0], ',', chapter[1], '\n']))
    # Create the missing csv data files for each chapter.
    for chapter in missing_chapters:
        # Get the list for the images of each chapter.
        chapter_name_ext = ''.join([chapter[1], '.csv'])
        if chapter_name_ext not in data_list_dir:
            chapter_image_list_to_csv(chapter)
        else:
            print(''.join([chapter_name_ext, ' already exists.']))


def download_image(image_list):
    """
    Downloads an image from the specified url and saves it
    with the specified name.

    image_list: List that contains the url and the file name.
    """
    # Get the content of the image from its url.
    img_data = requests.get(image_list[0]).content
    # Open a file with the corresponding name as 'wb' (write, binary),
    # and write the img_data to it.
    with open(image_list[1], 'wb') as handler:
        handler.write(img_data)


def download_chapter(image_list):
    """
    Downloads the whole chapter as images.

    image_list: List containing the url and file name of each image.
    """
    for image in image_list:
        print(image)
        download_image(image)
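# A minimal sketch of the "missing chapters" diff that update_database
# performs, with made-up sample data: the web list comes newest-first,
# everything above the last stored chapter is collected, and the result
# is reversed back into ascending order.
def _example_missing_chapters():
    newest_first = [['url/3', 'chapter_3'],
                    ['url/2', 'chapter_2'],
                    ['url/1', 'chapter_1']]
    last_stored = ['url/1', 'chapter_1']
    missing = []
    for chapter in newest_first:
        if chapter == last_stored:
            break
        missing.append(chapter)
    return missing[::-1]  # [['url/2', 'chapter_2'], ['url/3', 'chapter_3']]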
def download_manga(manga_name=MANGA_NAME, manga_dir=MANGA_DIR,
                   manga_data_dir=MANGA_DATA_DIR):
    """
    Downloads a whole manga, saving it to subfolders.
    Uses the database already created.

    manga_name: Actual name of the manga, as it appears in the webpage.
    manga_dir: Main manga folder on the computer; subfolders are created here.
    manga_data_dir: Main manga data folder on the computer.

    NOTE: This updates the manga, downloading the missing chapters
    if they're listed in the database.
    """
    # A folder-friendly name for the manga, also used as the name
    # of the main database.
    m_name = '_'.join(word.lower() for word in manga_name.split())
    m_name_ext = ''.join([m_name, '.csv'])
    # Go to where the database is located.
    os.chdir(manga_data_dir)
    try:
        os.chdir(m_name)
    except FileNotFoundError:
        print(''.join([m_name, ' folder doesn\'t exist.',
                       ' Most likely, the database hasn\'t been created.']))
        raise NameError('Create database first.')
    # Get info on the files in the database.
    data_list_dir = os.listdir()
    # Read the data from the main database.
    if m_name_ext not in data_list_dir:
        print(''.join([m_name_ext, ' doesn\'t exist.',
                       ' Most likely, the database hasn\'t been created.']))
        raise NameError('Create database first.')
    else:
        chapters_list = chapters_csv_to_list(m_name_ext)
    # Navigate to the main manga dir,
    # and either create or go to the manga folder.
    os.chdir(manga_dir)
    try:
        os.mkdir(manga_name)
        os.chdir(manga_name)
    except FileExistsError:
        print(''.join([manga_name, ' folder already exists.']))
        os.chdir(manga_name)
    # Get data on the folders in the manga folder.
    manga_list_dir = os.listdir()
    for chapter in chapters_list:
        # chapter_url = chapter[0]
        chapter_name = chapter[1]
        ch_name_ext = ''.join([chapter_name, '.csv'])
        if chapter_name not in manga_list_dir:
            print(''.join(['Downloading ', chapter_name, ' now.']))
            # First, create the chapter folder.
            os.mkdir(chapter_name)
            # Go to where the database is located.
            os.chdir(manga_data_dir)
            os.chdir(m_name)
            chapter_image_list = chapter_image_csv_to_list(ch_name_ext)
            # Go back to where the manga is going to be downloaded.
            os.chdir(manga_dir)
            os.chdir(manga_name)
            os.chdir(chapter_name)
            # Download all the chapter images into their respective folder.
            download_chapter(chapter_image_list)
            # Go back one folder to repeat the process
            # for the next chapter.
            os.chdir('..')
        else:
            print(''.join([chapter_name, ' already downloaded.']))


# # # TEST STUFF # # #
# create_database()
# update_database()
# download_manga()
# print(os.listdir())
# chapter_csv_to_df(chapter_csv='kimetsu_no_yaiba.csv')
# download_manga()
# chapter_list = get_chapters_list()
# for chapter in chapter_list:
#     print(chapter)
# chapters_list_to_csv(chapters_list=chapter_list)
# os.chdir('data/kimetsu_no_yaiba')
# for image in chapter_image_csv_to_list('chapter_1_cruelty.csv'):
#     print(image)
# chapters_list = chapters_csv_to_list(chapter_csv='kimetsu_no_yaiba.csv')
# chapter_image_list_to_csv(chapters_list[0])
# first_chapter_img_url_list = get_chapter_image_list(chapter_list[0])
# download_image(first_chapter_img_url_list[0])
# os.chdir(MANGA_DIR)
# download_chapter(first_chapter_img_url_list)
print('This is a test, bitch.')
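# A possible end-to-end run, sketched here commented out like the test calls
# above: build or refresh the csv database first, then download from it.
# if __name__ == '__main__':
#     create_database()
#     update_database()
#     download_manga()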