diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..e919e4c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# Editor config -- see http://editorconfig.org/ +root = true + +# Unix-style newlines +[*] +end_of_line = lf +insert_final_newline = true + +# 4 space indentation +[*.py] +indent_style = space +indent_size = 4 diff --git a/slideslurp/__init__.py b/slideslurp/__init__.py index d76d8a7..4166888 100644 --- a/slideslurp/__init__.py +++ b/slideslurp/__init__.py @@ -1,40 +1,56 @@ +""" Slideslurp scrapes a PDF from SlideShare.net (LinkedIn). """ + import argparse import sys - +import urllib import bs4 import requests - -from reportlab.pdfgen import canvas - +import reportlab.pdfgen.canvas def parse_args(): + """ Parse the command-line arguments and return them as an object. + The returned object has the following properties: + - `url` (string): the URL of the slideshare to download; + - `out` (string): the output file path. """ descr = "Generate PDFs from Slideshare presentations" parser = argparse.ArgumentParser(description=descr) parser.add_argument('url', metavar='url', nargs=1, help='the URL to slurp') - parser.add_argument('--output', '-o', default='out.pdf', - help='the file to write to (default: out.pdf)') + parser.add_argument('--out', '--output', '-o', default=None, + help='force the output file name') - return parser.parse_args() + args = parser.parse_args() + # If the user doesn't specify an output filename, we'll deduce + # one from the URL. Here's a URL example: + # https://www.slideshare.net/FrisodeJong/iso-20022-for-dummies + # + # We're taking the path (`/FrisodeJong/iso-20022-for-dummies`), + # isolating the part after the second slash (the first slash + # always being at the first position), and appending `.pdf` to it. + if not args.out: + path = urllib.parse.urlparse(args.url[0]).path + args.out = '%s.pdf'%path[path.find('/', 1) + 1:] + return args def main(): + """ Main function of the program. 
+ Uses the global command-line arguments (not custom ones)! """ args = parse_args() res = requests.get(args.url[0]) - tree = bs4.BeautifulSoup(res.text, "html.parser") - c = canvas.Canvas(args.output) + canvas = reportlab.pdfgen.canvas.Canvas(args.out) for img in tree.findAll("img", class_="slide_image"): img_url = img.attrs["data-full"] - page_width, page_height = c._pagesize - c.setPageRotation(90) - c.drawImage(img_url, 0, 0, page_height, page_width, preserveAspectRatio=True) - c.showPage() - c.save() - + page_width, page_height = canvas._pagesize + canvas.drawImage(img_url, 0, 0, page_height, page_width, + preserveAspectRatio=True) + canvas.setPageSize((page_height, page_width)) + canvas.showPage() + canvas.save() if __name__ == '__main__': main()