commit 23c73f1bc9401e7b5163d3b3b0cb9b5d678a3de7 Author: syrell Date: Sun Jul 9 16:12:36 2023 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0561ce6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +.vscode/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6c9eef8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,11 @@ +Copyright (c) . All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..fd9385c --- /dev/null +++ b/README.adoc @@ -0,0 +1,54 @@ += Anki Pokemon flashcards generator + +== Context + +I wanted to learn the Pokedex with Anki flashcards but could not find a deck that suited my needs, so I wrote a small script that scrapes Pokemon pictures and formats a CSV file that can easily be imported into Anki. + +== Procedure and usage + +This script downloads every Pokedex entry for a given generation. It scrapes the https://www.pokemontrash.com/[Pokemon Trash] website, which only covers the 1st to 7th generations but provides good-quality PNG pictures of the Pokemon. The entries are then formatted and written to a CSV file that you can import into Anki. 
+ +WARNING: Because of its nature, scraping is not guaranteed to keep working in the future. Also, this script retrieves French Pokemon names. + +[source,shell] +----- +usage: pokemons_to_anki.py [-h] -g GENERATION [-p IMAGES_PATH] [-f FILENAME] + +optional arguments: + -h, --help show this help message and exit + -g GENERATION, --generation GENERATION + Generation to retrieve <1-7> + -p IMAGES_PATH, --images-path IMAGES_PATH + Where to store Pokemons pictures, default to ./Pictures + -f FILENAME, --filename FILENAME + Name of CSV produced in output, default to output.csv +----- + +== CSV Output and flashcard example + +[.right] +image:flashcard_example.png[Camérupt flashcard] + +[source, csv] +
N°252 ;Arcko +
N°253 ;Massko +
N°254 ;Jungko +
N°255 ;Poussifeu +
N°256 ;Galifeu +
N°257 ;Braségali +
N°258 ;Gobou +
N°259 ;Flobio +
N°260 ;Laggron +
N°261 ;Medhyèna +
N°262 ;Grahyèna +
N°263 ;Zigzaton +
N°264 ;Linéon +
N°265 ;Chenipotte +
N°266 ;Armulys +
N°267 ;Charmillon +
N°268 ;Blindalys +
N°269 ;Papinox +
N°270 ;Nénupiot +
N°271 ;Lombre +
# Patch context that shared a physical line with this script (end of the
# README CSV sample and the diff headers), kept verbatim:
#
#   N°272 ;Ludicolo
#   ...
#   diff --git a/flashcard_example.png b/flashcard_example.png
#   new file mode 100644
#   index 0000000..4700761
#   Binary files /dev/null and b/flashcard_example.png differ
#   diff --git a/pokemons_to_anki.py b/pokemons_to_anki.py
#   new file mode 100644
#   index 0000000..9b0a96b
#   --- /dev/null
#   +++ b/pokemons_to_anki.py
#   @@ -0,0 +1,137 @@
#
# -*- coding: utf-8 -*-
"""Scrape a Pokedex generation and write an Anki-importable CSV file.

For every Pokemon of the chosen generation the script downloads the first
image of the Pokemon's page and emits one CSV row made of an HTML prefix
followed by the Pokemon name.
"""
import argparse
import os
import re
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Seconds to wait on any single HTTP request before giving up; without a
# timeout a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30

# The listing page orders its tables from the newest generation down to the
# first one, so table index 0 is this generation.
LATEST_GENERATION = 7

# First run of 1-3 digits found in the markup of a table cell.
POKEDEX_NUMBER_RE = re.compile(r"[0-9]{1,3}")
# Relative address of a Pokemon page inside the markup of a cell's links.
POKEMON_PAGE_RE = re.compile(r"[0-9]{1,3}.*html")


def download_img(url, img_path) -> None:
    """Download the file at *url* into the directory *img_path*.

    The local file is named after the last path component of *url*.  When
    that file already exists nothing is requested and a notice is printed.

    Args:
        url: Absolute URL of the picture.
        img_path: Existing directory that receives the picture.

    Raises:
        SystemExit: If the request fails or the server answers with an
            HTTP error status.
    """
    filename = os.path.join(img_path, url.split("/")[-1])

    # Look at the disk before touching the network: an existing picture
    # needs no request at all.
    if os.path.exists(filename):
        print(f"File {filename} already exists!")
        return

    # Download under a temporary name so an interrupted transfer never
    # leaves a truncated picture that a later run would treat as complete.
    partial = filename + ".part"
    try:
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            # Without this check an HTML error page would be saved as the
            # picture.
            response.raise_for_status()
            file_size = int(response.headers.get("Content-Length", 0))
            with open(partial, "wb") as f, tqdm(
                desc=f"Downloading {filename}",
                total=file_size or None,  # unknown size when header is absent
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress:
                for data in response.iter_content(1024):
                    f.write(data)
                    progress.update(len(data))
    except requests.exceptions.RequestException as e:
        if os.path.exists(partial):
            os.remove(partial)
        raise SystemExit(e)

    os.replace(partial, filename)


def write_csv_entries(pokemons_list, csv_file) -> None:
    """Write every row of *pokemons_list* to *csv_file*, one per line.

    A ``.csv`` suffix is appended when *csv_file* lacks one.  An existing
    file is never overwritten; a notice is printed instead.

    Args:
        pokemons_list: Iterable of already formatted row strings.
        csv_file: Destination path, with or without the ``.csv`` suffix.
    """
    # Normalise the name first so the existence check looks at the file that
    # would really be written.
    if not csv_file.lower().endswith(".csv"):
        csv_file += ".csv"

    try:
        # Mode "x" creates the file only if it is absent, which makes the
        # "never overwrite" rule atomic.
        f = open(csv_file, "x", encoding="utf-8")
    except FileExistsError:
        print(f"File {csv_file} already exists!")
        return

    with f:
        print(f"Writing entries to {csv_file}")
        f.writelines(pokemon_row + "\n" for pokemon_row in pokemons_list)


def find_pokemon_image(main_url) -> List:
    """Return every ``<table class="pokedex">`` element found at *main_url*.

    Raises:
        SystemExit: If the request fails or the server answers with an
            HTTP error status.
    """
    try:
        res = requests.get(main_url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    soup = BeautifulSoup(res.content, "html.parser")
    return soup.find_all("table", {"class": "pokedex"})


def format_csv_entries(pokemons_table, db_url, pokedex_url, img_path) -> List:
    """Build the CSV rows of one table and download each Pokemon picture.

    The third ``<td>`` of every row supplies the Pokemon name (its text),
    the entry number (first 1-3 digit run in its markup) and the link to
    the Pokemon's own page, whose first ``<img>`` is downloaded.

    Args:
        pokemons_table: Parsed table element of the chosen generation.
        db_url: Site root, used to resolve relative picture addresses.
        pokedex_url: Base URL that the per-Pokemon page address is
            appended to.
        img_path: Directory that receives the pictures.

    Returns:
        The list of formatted row strings, in table order.

    Raises:
        SystemExit: If a request fails or returns an HTTP error status.
    """
    pokemons_list = []
    for row in pokemons_table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            # Rows without a third data cell hold no Pokemon to process.
            continue
        name_cell = cells[2]
        current_pokemon = name_cell.get_text()

        entry_match = POKEDEX_NUMBER_RE.search(str(name_cell))
        if entry_match is None:
            print(f"No Pokedex number found for {current_pokemon!r}, row skipped")
            continue
        current_entry = entry_match.group()

        # NOTE(review): the HTML markup of this prefix was stripped from the
        # copy of the script under review.  It is reconstructed from the
        # README CSV sample (one tag, a line break, then "N°<number> ;") and
        # from the picture naming used by download_img -- confirm against
        # the original script.
        anki_balise = (
            '<img src="' + current_entry + '.png"><br>N°' + current_entry + " ;"
        )
        pokemons_list.append(anki_balise + current_pokemon)

        page_match = POKEMON_PAGE_RE.search(str(name_cell.find_all("a")))
        if page_match is None:
            print(f"No page link found for entry {current_entry}, picture skipped")
            continue
        new_url = pokedex_url + page_match.group()

        try:
            entry_pokemon = requests.get(new_url, timeout=REQUEST_TIMEOUT)
            entry_pokemon.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)

        new_soup = BeautifulSoup(entry_pokemon.content, "html.parser")
        first_img = new_soup.find("img")
        img_src = first_img.get("src") if first_img is not None else None
        if not img_src:
            print(f"No picture found on {new_url}, picture skipped")
            continue

        # urljoin leaves absolute addresses untouched and resolves relative
        # ones against the site root without doubling the slash.
        download_img(urljoin(db_url, img_src), img_path)
    return pokemons_list


def main() -> None:
    """Parse the command line, scrape the chosen generation, write the CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-g",
        "--generation",
        type=int,
        help=" Generation to retrieve <1-7>",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--images-path",
        help="Where to store Pokemons pictures, default to ./Pictures",
        default="Pictures",
    )
    parser.add_argument(
        "-f",
        "--filename",
        help="Name of CSV produced in output, default to output.csv",
        default="output.csv",
    )
    args = parser.parse_args()

    # An out-of-range value would otherwise turn into a negative or too
    # large list index and silently pick the wrong table or crash.
    if not 1 <= args.generation <= LATEST_GENERATION:
        parser.error(f"generation must be between 1 and {LATEST_GENERATION}")

    os.makedirs(args.images_path, exist_ok=True)

    db_url = "https://www.pokemontrash.com/"
    pokedex_url = db_url + "pokedex/"
    main_url = pokedex_url + "liste-pokemon.php"

    pokemons_soap = find_pokemon_image(main_url)
    table_index = LATEST_GENERATION - args.generation
    if table_index >= len(pokemons_soap):
        raise SystemExit(
            f"Expected at least {table_index + 1} pokedex tables at {main_url}, "
            f"found {len(pokemons_soap)}"
        )

    pokemons_list = format_csv_entries(
        pokemons_soap[table_index], db_url, pokedex_url, args.images_path
    )
    write_csv_entries(pokemons_list, args.filename)


if __name__ == "__main__":
    main()