sekaipedia-character-card-download: init
commit 115720302c
3 changed files with 81 additions and 0 deletions
70	sekaipedia-character-card-download/main.py	Normal file
@@ -0,0 +1,70 @@
#!/usr/bin/env python
from time import sleep
from bs4 import BeautifulSoup
import requests
from platform import python_version
import platform
import os


# identify the scraper to the wiki on every request
useragent = f"sekaipedia character card scraping bot (python {python_version()}, os {platform.system()}, host {platform.node()})"
outdir = "out"
print("using user agent:", useragent)
print("using out dir:", outdir)
os.makedirs(outdir, exist_ok=True)

print("input the character card url from sekaipedia you want to scrape (eg. https://www.sekaipedia.org/wiki/Asahina_Mafuyu/Cards)")
characterUrl = input("> ")
r = requests.get(characterUrl, headers={"user-agent": useragent})
if len(r.text) <= 500:
    # print output in case of failures from the wiki
    print(r.text)
soup = BeautifulSoup(r.text, 'html.parser')
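

# downloadWikiImage: follow the link titled `title` on a card page to the image's
# own wiki page, pick out the full-resolution file link, and save it as out/<name>/<title>.png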
def downloadWikiImage(title, page, name):
    # go to the image's wiki page (to get the full resolution)
    cardArt = "https://sekaipedia.org" + str(page.find("a", title=title).get("href"))
    r = requests.get(cardArt, headers={"user-agent": useragent})

    imagePageSoup = BeautifulSoup(r.text, 'html.parser')
    imageLink = "https:" + str(imagePageSoup.find("a", class_="internal").get("href"))

    print(f"found high resolution card {title} for {name} at {imageLink}")

    # download the file
    dldir = os.path.join(outdir, name)
    os.makedirs(dldir, exist_ok=True)

    r = requests.get(imageLink, headers={"user-agent": useragent})
    with open(os.path.join(dldir, f"{title}.png"), mode="wb") as file:
        file.write(r.content)
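

# walk every wikitable on the listing page; the plain text links in each table
# point at individual card pages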
for table in soup.find_all(class_="wikitable"):
    for link in table.find_all('a'):
        # skip links that wrap other tags (e.g. thumbnails); keep plain text links
        if link.contents[0].name is not None:
            continue

        # go to the card page
        cardlink = "https://sekaipedia.org" + str(link.get('href'))
        r = requests.get(cardlink, headers={"user-agent": useragent})

        cardPageSoup = BeautifulSoup(r.text, 'html.parser')
        try:
            cardName = cardPageSoup.find("span", class_="mw-page-title-main").contents[0]
        except AttributeError:
            cardName = cardPageSoup.find("h1", class_="firstHeading").contents[0]

        if os.path.exists(os.path.join(outdir, cardName)):
            print(f"already downloaded cards for {cardName}")
            continue

        print("getting cards for:", cardName)
        downloadWikiImage("Art", cardPageSoup, cardName)

        try:
            downloadWikiImage("Trained art", cardPageSoup, cardName)
        except AttributeError:
            print(f"Card {cardName} does not have a Trained Art")

        # be gentle with the wiki between card pages
        sleep(3)