# pokemons_to_anki/pokemons_to_anki.py
# -*- coding: utf-8 -*-
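"""Scrape one generation's Pokédex entries from pokemontrash.com and turn
them into an Anki-importable deck: a semicolon-separated CSV plus one local
image per Pokémon.

Third-party dependencies (from the imports below):
    pip install requests beautifulsoup4 tqdm
"""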
import argparse
import os
import re
from typing import List
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def download_img(url, img_path) -> None:
    # Name the local file after the last URL segment, e.g. <entry_number>.png
    filename = os.path.join(img_path, url.split("/")[-1])
    # Skip the download entirely if the file is already on disk
    if os.path.exists(filename):
        print(f"File {filename} already exists!")
        return
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    file_size = int(response.headers.get("Content-Length", 0))
    # Stream the body in 1 KiB chunks, updating the progress bar by bytes written
    with tqdm(
        desc=f"Downloading {filename}",
        total=file_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress, open(filename, "wb") as f:
        for data in response.iter_content(1024):
            f.write(data)
            progress.update(len(data))


def write_csv_entries(pokemons_list, csv_file) -> None:
    # Ensure the name ends in .csv before the existence check, so the check
    # matches the file actually written
    if not csv_file.endswith(".csv"):
        csv_file = csv_file + ".csv"
    if os.path.exists(csv_file):
        print(f"File {csv_file} already exists!")
        return
    print(f"Writing entries to {csv_file}")
    with open(csv_file, "w", encoding="utf-8") as f:
        for pokemon_row in pokemons_list:
            f.write(pokemon_row + "\n")
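
# Anki side: import the resulting file (File > Import in Anki) with ";" as
# the field separator, matching the " ;" written into each row below.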


def find_pokemon_image(main_url) -> List:
    # The index page lists each generation in its own <table class="pokedex">
    try:
        res = requests.get(main_url)
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    soup = BeautifulSoup(res.content, "html.parser")
    return soup.find_all("table", {"class": "pokedex"})


def format_csv_entries(pokemons_table, db_url, pokedex_url, img_path) -> List:
    pokedex_number = r"[0-9]{1,3}"  # Pokédex entry number (1-3 digits)
    pokemon_pattern = r"[0-9]{1,3}.*html"  # Relative link to a Pokémon's own page
    pokemons_list = []
    for row in pokemons_table.find_all("tr"):
        # The third column holds the entry number, the name and the detail link
        columns = row.find_all("td")[2]
        current_pokemon = columns.get_text()
        current_entry = re.search(pokedex_number, str(columns)).group()
        # Build the Anki-side field: embedded image and entry number, then the
        # ";" field separator before the Pokémon's name
        anki_balise = (
            '<img src="'
            + current_entry
            + '.png"><br>N°'
            + current_entry
            + " ;"
        )
        pokemons_list.append(anki_balise + current_pokemon)
        # Follow the per-Pokémon page to find its artwork
        link_balise = columns.find_all("a")
        current_url = re.search(pokemon_pattern, str(link_balise)).group()
        new_url = pokedex_url + current_url
        try:
            entry_pokemon = requests.get(new_url)
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)
        new_soup = BeautifulSoup(entry_pokemon.content, "html.parser")
        pokemon_img_url = new_soup.find_all("img")[0].attrs.get("src")
        # Resolve relative image URLs against the site root
        if not pokemon_img_url.startswith("http"):
            pokemon_img_url = db_url + pokemon_img_url
        download_img(pokemon_img_url, img_path)
    return pokemons_list
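
# A produced row looks roughly like the following (the exact name text
# depends on the site's markup; "Bulbizarre" here is just an illustration):
#   <img src="1.png"><br>N°1 ;Bulbizarre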


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-g",
        "--generation",
        type=int,
        help="Generation to retrieve (1-7)",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--images-path",
        help="Where to store Pokémon pictures, defaults to ./Pictures",
        default="Pictures",
    )
    parser.add_argument(
        "-f",
        "--filename",
        help="Name of the output CSV, defaults to output.csv",
        default="output.csv",
    )
    args = parser.parse_args()
    if not os.path.exists(args.images_path):
        os.makedirs(args.images_path)
    db_url = "https://www.pokemontrash.com/"
    pokedex_url = db_url + "pokedex/"
    main_url = pokedex_url + "liste-pokemon.php"
    pokemons_soup = find_pokemon_image(main_url)
    # Generation tables are listed newest first: index 0 is generation 7,
    # so generation g sits at index 7 - g
    chosen_generation_table = pokemons_soup[7 - args.generation]
    pokemons_list = format_csv_entries(
        chosen_generation_table, db_url, pokedex_url, args.images_path
    )
    write_csv_entries(pokemons_list, args.filename)


if __name__ == "__main__":
    main()
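
# Example invocation (assumed usage: fetches generation 1, saves images to
# ./Pictures and writes the deck to gen1.csv):
#   python pokemons_to_anki.py --generation 1 --filename gen1.csv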