#!/usr/bin/env python
"""Scrape full-resolution character card art from a Sekaipedia card-list page.

Prompts for a ".../Cards" wiki URL, walks every card link in the page's
wikitables, and saves each card's "Art" and (when present) "Trained art"
images under out/<card name>/.
"""
from time import sleep
from bs4 import BeautifulSoup
import requests
from platform import python_version
import platform
import os

# Identify the bot politely to the wiki operators.
useragent = (
    f"sekaipedia character card scraping bot "
    f"(python {python_version()}, os {platform.system()}, host {platform.node()})"
)
outdir = "out"

print("using user agent:", useragent)
print("using out dir:", outdir)
# BUG FIX: original had a bare `os.makedirs` (attribute reference, never
# called) — actually create the output directory.
os.makedirs(outdir, exist_ok=True)

print("input the character card url from sekaipedia you want to scrape (eg. https://www.sekaipedia.org/wiki/Asahina_Mafuyu/Cards)")
characterUrl = input("> ")

# timeout added so a stalled connection cannot hang the script forever.
r = requests.get(characterUrl, headers={"user-agent": useragent}, timeout=30)
if len(r.text) <= 500:
    # Print output incase of failures from wiki (a very short body is
    # usually an error page, not a card list).
    print(r.text)
soup = BeautifulSoup(r.text, 'html.parser')


def downloadWikiImage(title, page, name):
    """Download the full-resolution image linked as ``title`` on a card page.

    title: link title attribute on the card page, e.g. "Art" or "Trained art".
    page:  BeautifulSoup of the card's wiki page.
    name:  card name, used as the output subdirectory under ``outdir``.

    Raises AttributeError when no link with that title exists on the page
    (callers use this to detect cards without trained art).
    """
    # go to image's wiki page (to get full resolution); the full-size file
    # sits behind an <a class="internal"> link there.
    cardArt = "https://sekaipedia.org" + str(page.find("a", title=title).get("href"))
    r = requests.get(cardArt, headers={"user-agent": useragent}, timeout=30)
    imagePageSoup = BeautifulSoup(r.text, 'html.parser')
    imageLink = "https:" + str(imagePageSoup.find("a", class_="internal").get("href"))
    print(f"found high resolution card {title} for {name} at {imageLink}")

    # download files into out/<name>/<title>.png
    dldir = os.path.join(outdir, name)
    os.makedirs(dldir, exist_ok=True)
    r = requests.get(imageLink, headers={"user-agent": useragent}, timeout=30)
    with open(os.path.join(dldir, f"{title}.png"), mode="wb") as file:
        file.write(r.content)


for table in soup.find_all(class_="wikitable"):
    for link in table.find_all('a'):
        # Skip links whose first child is an element (e.g. thumbnail images);
        # the plain-text links are the card-name links we want to follow.
        if link.contents[0].name is not None:
            continue

        # go to card page
        cardlink = "https://sekaipedia.org" + str(link.get('href'))
        r = requests.get(cardlink, headers={"user-agent": useragent}, timeout=30)
        cardPageSoup = BeautifulSoup(r.text, 'html.parser')
        try:
            cardName = cardPageSoup.find("span", class_="mw-page-title-main").contents[0]
        except AttributeError:
            # Fallback for skins that put the title directly in the h1;
            # was a bare `except:` — narrow it to the actual failure mode
            # (find() returning None).
            cardName = cardPageSoup.find("h1", class_="firstHeading").contents[0]

        if os.path.exists(os.path.join(outdir, cardName)):
            print(f"already downloaded cards for {cardName}")
            continue

        print("getting cards for:", cardName)
        downloadWikiImage("Art", cardPageSoup, cardName)
        try:
            downloadWikiImage("Trained art", cardPageSoup, cardName)
        except AttributeError:
            # Untrained-only cards have no "Trained art" link on the page.
            print(f"Card {cardName} does not have a Trained Art")
        # Be gentle with the wiki between card pages.
        sleep(3)