fgtools/misc/scrape-scribd.py

#!/usr/bin/env python
#-*- coding:utf-8 -*-
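"""Download the pages of a Scribd document as images and join them into a PDF.

Example usage (the URL below is a placeholder, not a real document):

    scrape-scribd.py https://www.scribd.com/document/000000/example -o example.pdf

The final step shells out to the img2pdf command-line tool, which must be installed.
"""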
import os
import argparse
import re

import requests
from PIL import Image
from bs4 import BeautifulSoup

from fgtools.utils import constants

# Matches the content-url of a page's JSONP document on the Scribd page.
json_pattern = r'(?<=content-url: ")https:\/\/html.scribdassets.com\/.+\.jsonp(?=")'
# Matches the <img …/> tags embedded in a page's JSONP document.
img_pattern = r'<img .+?\/>'
class JSPage:
    """One page of the document, as described by its inline JavaScript page definition."""
    def __init__(self, number, width, height, url):
        self.number = number
        self.width = width
        self.height = height
        self.url = url

    def get_image(self):
        # Fetch the page's JSONP document and extract all <img> tags from it.
        text = requests.get(self.url).text
        images = list(map(lambda s: BeautifulSoup(s.replace("\\", ""), features="lxml").body.find("img"), re.findall(img_pattern, text)))
        print(len(images))
        # All <img> tags reference tiles of one source image; assemble them on a white canvas.
        src_image = Image.open(requests.get(images[0]["orig"], stream=True).raw)
        pil_image = Image.new("RGB", (self.width, self.height))
        pil_image.paste((255, 255, 255), (0, 0, pil_image.size[0], pil_image.size[1]))
        for image in images:
            # Parse the tile's inline CSS into a dict, dropping the "px" units.
            style = {}
            for item in image["style"].split(";"):
                item = item.split(":")
                style[item[0]] = item[1].replace("px", "")
            # The clip property has the form "rect(top right bottom left)".
            clip = style["clip"]
            clip = {k: int(v) for k, v in zip(("top", "right", "bottom", "left"), clip[clip.find("(") + 1:-1].split(" "))}
            # Cut the tile out of the source image and paste it at its target position.
            cropped_src_image = src_image.copy().crop((clip["left"], clip["top"], clip["right"], clip["bottom"]))
            pil_image.paste(cropped_src_image, (int(style["left"]) + clip["left"], int(style["top"]) + clip["top"]))
        return pil_image
def parse_pages_script(script):
    """Extract page descriptions (pageNum, origWidth, origHeight, contentUrl) from an inline <script>."""
    lines = list(map(str.strip, script.split("\n")[1:-1]))
    number = 0
    width = 0
    height = 0
    url = ""
    pages = []
    for line in lines:
        if "pageNum" in line:
            number = int(line.split(": ")[1][:-1])
        elif "origWidth" in line:
            width = int(line.split(": ")[1][:-1])
        elif "origHeight" in line:
            height = int(line.split(": ")[1][:-1])
        elif "contentUrl" in line:
            url = line.split(": ")[1][1:-1]
        # Once all four fields have been seen, the page description is complete.
        if number and width and height and url:
            page = JSPage(number, width, height, url)
            pages.append(page)
            number = width = height = 0
            url = ""
    return pages
def download_pages(url, output):
    """Download every page of the document at url as a JPEG into the cache directory and return the paths."""
    html = BeautifulSoup(requests.get(url).text, features="lxml")
    # The page definitions live in a <script> tag inside the outer page container.
    pages_script = html.body.find("div", attrs={"class": "outer_page_container"}).find("script", attrs={"type": "text/javascript"})
    pages = sorted(parse_pages_script(str(pages_script)), key=lambda p: p.number)
    paths = []
    for page in pages:
        path = os.path.join(constants.CACHEDIR, os.path.split(output)[-1] + f"-{page.number}.jpg")
        paths.append(path)
        page.get_image().save(path, "JPEG")
    return paths
def write_pdf(paths, output):
    """Join the downloaded JPEG pages into a single PDF using img2pdf, then delete the JPEGs."""
    print(f"Joining {len(paths)} JPG files into {output} … ", end="")
    newpaths = " ".join([f'"{path}"' for path in paths])
    os.system(f'img2pdf {newpaths} --output "{output}"')
    print("done.")
    print("Deleting JPG files … ", end="")
    for path in paths:
        os.remove(path)
    print("done.")
if __name__ == "__main__":
    argp = argparse.ArgumentParser()
    argp.add_argument(
        "url",
        help="URL to Scribd web page"
    )
    argp.add_argument(
        "-o", "--output",
        help="Output file",
        required=True
    )
    args = argp.parse_args()
    # Make sure the output directory exists before writing anything into it.
    os.makedirs(os.path.join(*os.path.split(os.path.relpath(args.output))[:-1]) or ".", exist_ok=True)
    paths = download_pages(args.url, args.output)
    write_pdf(paths, args.output)