138 lines
4.1 KiB
Python
138 lines
4.1 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
import argparse
|
||
|
import os
|
||
|
import re
|
||
|
from typing import List
|
||
|
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
|
||
|
def download_img(url, img_path, timeout=30) -> None:
    """Download the image at ``url`` into the ``img_path`` directory.

    The file is named after the last ``/``-separated segment of ``url``.
    If a file with that name already exists, nothing is requested and the
    existing file is left untouched.

    Args:
        url: URL of the image to fetch.
        img_path: Directory in which the image is stored.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        SystemExit: If the target path cannot be built, the request fails,
            or the server answers with an HTTP error status.
    """
    try:
        # Format path as img_path/<last URL segment>
        filename = os.path.join(img_path, url.split("/")[-1])
    except Exception as e:
        raise SystemExit(e)

    # Check the disk before touching the network: an existing image needs no
    # request at all.
    if os.path.exists(filename):
        print(f"File {filename} already exists!")
        return

    completed = False
    try:
        # The context managers close the connection, the file and the
        # progress bar even when the transfer is interrupted.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTML error page would be saved as the image.
            response.raise_for_status()
            file_size = int(response.headers.get("Content-Length", 0))
            with open(filename, "wb") as f, tqdm(
                desc=f"Downloading {filename}",
                # An absent Content-Length means "unknown size", not 0 bytes.
                total=file_size or None,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress:
                for data in response.iter_content(1024):
                    f.write(data)
                    progress.update(len(data))
        completed = True
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    finally:
        # A truncated file would be mistaken for a finished download on the
        # next run (existing files are skipped), so discard it.
        if not completed and os.path.exists(filename):
            os.remove(filename)
|
||
|
|
||
|
|
||
|
def write_csv_entries(pokemons_list, csv_file) -> None:
    """Write each entry of ``pokemons_list`` on its own line of a CSV file.

    ``.csv`` is appended to ``csv_file`` when it does not already end with
    that extension. An existing file is never overwritten; a message is
    printed instead.

    Args:
        pokemons_list: Iterable of already formatted rows.
        csv_file: Output path, with or without the ``.csv`` extension.
    """
    # Normalise the name first so the existence check looks at the file that
    # would actually be written (e.g. "output" -> "output.csv").
    if not csv_file.lower().endswith(".csv"):
        csv_file += ".csv"

    if os.path.exists(csv_file):
        print(f"File {csv_file} already exists!")
        return

    with open(csv_file, "w", encoding="utf-8") as f:
        print(f"Writing entries to {csv_file}")
        f.writelines(f"{pokemon_row}\n" for pokemon_row in pokemons_list)
|
||
|
|
||
|
|
||
|
def find_pokemon_image(main_url, timeout=30) -> List:
    """Fetch ``main_url`` and return its ``<table class="pokedex">`` elements.

    Args:
        main_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The matching table elements in document order (empty if none).

    Raises:
        SystemExit: If the request fails or the server answers with an HTTP
            error status.
    """
    try:
        res = requests.get(main_url, timeout=timeout)
        # An error page has no pokedex tables; fail here with the real cause
        # rather than handing an empty list to the caller.
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    soup = BeautifulSoup(res.content, "html.parser")
    return soup.find_all("table", {"class": "pokedex"})
|
||
|
|
||
|
|
||
|
def format_csv_entries(pokemons_table, db_url, pokedex_url, img_path) -> List[str]:
    """Build the CSV rows for one table and download each entry's image.

    For every ``<tr>`` of ``pokemons_table`` the third ``<td>`` is read. The
    row produced is ``<img src="N.png"><br>N°N ;TEXT`` where ``N`` is the
    first run of 1-3 digits found in that cell's HTML and ``TEXT`` is the
    cell's text. The cell's link is then followed and the first ``<img>`` of
    the linked page is downloaded into ``img_path``.

    Rows with fewer than three ``<td>`` cells, or without a number, are
    skipped. Rows without a usable link or image still produce a CSV entry,
    but no download.

    Args:
        pokemons_table: Parsed table element exposing ``find_all``.
        db_url: Base URL that relative image paths are resolved against.
        pokedex_url: Prefix prepended to each entry's page name.
        img_path: Directory in which images are stored.

    Returns:
        The formatted rows, in table order.

    Raises:
        SystemExit: If fetching an entry page fails.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    from urllib.parse import urljoin

    request_timeout = 30  # seconds; avoids hanging forever on a dead server
    # Compiled once instead of being re-parsed for every row.
    pokedex_number = re.compile(r"[0-9]{1,3}")
    pokemon_pattern = re.compile(r"[0-9]{1,3}.*html")

    pokemons_list = []
    for row in pokemons_table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            # Rows without a third <td> (e.g. header rows) carry no entry.
            continue
        columns = cells[2]

        entry_match = pokedex_number.search(str(columns))
        if entry_match is None:
            continue
        current_entry = entry_match.group()
        current_pokemon = columns.get_text()
        anki_balise = f'<img src="{current_entry}.png"><br>N°{current_entry} ;'
        pokemons_list.append(anki_balise + current_pokemon)

        link_balise = columns.find_all("a")
        url_match = pokemon_pattern.search(str(link_balise))
        if url_match is None:
            print(f"No page link found for entry {current_entry}, image skipped")
            continue
        new_url = pokedex_url + url_match.group()
        try:
            entry_pokemon = requests.get(new_url, timeout=request_timeout)
            entry_pokemon.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)

        new_soup = BeautifulSoup(entry_pokemon.content, "html.parser")
        first_img = new_soup.find("img")
        pokemon_img_url = first_img.get("src") if first_img is not None else None
        if not pokemon_img_url:
            print(f"No image found on {new_url}, image skipped")
            continue
        # urljoin leaves absolute URLs alone and resolves relative ones
        # against db_url without producing a doubled slash.
        download_img(urljoin(db_url, pokemon_img_url), img_path)
    return pokemons_list
|
||
|
|
||
|
|
||
|
def main() -> None:
    """Parse the command line, scrape one generation and write its CSV.

    Options:
        -g/--generation   Generation to retrieve, 1 to 7 (required).
        -p/--images-path  Directory for the pictures (default ``Pictures``).
        -f/--filename     Name of the CSV to produce (default ``output.csv``).

    Raises:
        SystemExit: On invalid arguments, or when the listing page does not
            contain a table for the requested generation.
    """
    # Table index 0 is the 7th generation, so the index is 7 - generation.
    latest_generation = 7

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-g",
        "--generation",
        type=int,
        # Out-of-range values must be rejected: a negative index would
        # silently select a table from the other end of the list.
        choices=range(1, latest_generation + 1),
        metavar="GENERATION",
        help="<int> Generation to retrieve <1-7>",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--images-path",
        help="Where to store Pokemons pictures, default to ./Pictures",
        default="Pictures",
    )
    parser.add_argument(
        "-f",
        "--filename",
        help="Name of CSV produced in output, default to output.csv",
        default="output.csv",
    )
    args = parser.parse_args()

    os.makedirs(args.images_path, exist_ok=True)

    db_url = "https://www.pokemontrash.com/"
    pokedex_url = db_url + "pokedex/"
    main_url = pokedex_url + "liste-pokemon.php"

    pokemon_tables = find_pokemon_image(main_url)
    table_index = latest_generation - args.generation
    if table_index >= len(pokemon_tables):
        raise SystemExit(
            f"No table found for generation {args.generation} on {main_url}"
        )
    chosen_generation_table = pokemon_tables[table_index]

    pokemons_list = format_csv_entries(
        chosen_generation_table, db_url, pokedex_url, args.images_path
    )
    write_csv_entries(pokemons_list, args.filename)
|
||
|
|
||
|
|
||
|
# Run the scraper only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
|