From e8275c472b2f114ad435692236b8a6f46e4169b0 Mon Sep 17 00:00:00 2001 From: "Thomas \"Cakeisalie5\" Touhey" Date: Fri, 5 May 2017 16:40:01 +0200 Subject: [PATCH 1/3] Better file name detection. --- slideslurp/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/slideslurp/__init__.py b/slideslurp/__init__.py index d76d8a7..c98e58f 100644 --- a/slideslurp/__init__.py +++ b/slideslurp/__init__.py @@ -3,6 +3,7 @@ import sys import bs4 import requests +from urllib.parse import urlparse from reportlab.pdfgen import canvas @@ -13,11 +14,14 @@ def parse_args(): parser = argparse.ArgumentParser(description=descr) parser.add_argument('url', metavar='url', nargs=1, help='the URL to slurp') - parser.add_argument('--output', '-o', default='out.pdf', + parser.add_argument('--output', '-o', default=None, help='the file to write to (default: out.pdf)') - return parser.parse_args() - + args = parser.parse_args() + if not args.output: + path = urlparse(args.url[0]).path + args.output = '%s.pdf'%path[path.find('/', 1) + 1:] + return args def main(): args = parse_args() @@ -31,7 +35,8 @@ def main(): img_url = img.attrs["data-full"] page_width, page_height = c._pagesize c.setPageRotation(90) - c.drawImage(img_url, 0, 0, page_height, page_width, preserveAspectRatio=True) + c.drawImage(img_url, 0, 0, page_height, page_width, + preserveAspectRatio=True) c.showPage() c.save() From 6e9cc213ad52601f9520304adcd0bd8dc1280230 Mon Sep 17 00:00:00 2001 From: "Thomas \"Cakeisalie5\" Touhey" Date: Fri, 5 May 2017 17:04:03 +0200 Subject: [PATCH 2/3] Editorconfig, lintian, slightly better output. 
--- .editorconfig | 12 ++++++++++++ slideslurp/__init__.py | 37 ++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 19 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..e919e4c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# Editor config -- see http://editorconfig.org/ +root = true + +# Unix-style newlines +[*] +end_of_line = lf +insert_final_newline = true + +# 4 space indentation +[*.py] +indent_style = space +indent_size = 4 diff --git a/slideslurp/__init__.py b/slideslurp/__init__.py index c98e58f..a36cb74 100644 --- a/slideslurp/__init__.py +++ b/slideslurp/__init__.py @@ -1,45 +1,44 @@ +""" Slideslurp scrapes a PDF from SlideShare.net (LinkedIn). """ + import argparse import sys - +import urllib import bs4 import requests -from urllib.parse import urlparse - -from reportlab.pdfgen import canvas - +import reportlab.pdfgen.canvas def parse_args(): + """ Parse the command-line arguments. """ descr = "Generate PDFs from Slideshare presentations" parser = argparse.ArgumentParser(description=descr) parser.add_argument('url', metavar='url', nargs=1, help='the URL to slurp') - parser.add_argument('--output', '-o', default=None, - help='the file to write to (default: out.pdf)') + parser.add_argument('--out', '--output', '-o', default=None, + help='force the output file name') args = parser.parse_args() - if not args.output: - path = urlparse(args.url[0]).path - args.output = '%s.pdf'%path[path.find('/', 1) + 1:] + if not args.out: + path = urllib.parse.urlparse(args.url[0]).path + args.out = '%s.pdf'%path[path.find('/', 1) + 1:] return args def main(): + """ Main function. 
""" args = parse_args() res = requests.get(args.url[0]) - tree = bs4.BeautifulSoup(res.text, "html.parser") - c = canvas.Canvas(args.output) + canvas = reportlab.pdfgen.canvas.Canvas(args.out) for img in tree.findAll("img", class_="slide_image"): img_url = img.attrs["data-full"] - page_width, page_height = c._pagesize - c.setPageRotation(90) - c.drawImage(img_url, 0, 0, page_height, page_width, - preserveAspectRatio=True) - c.showPage() - c.save() - + page_width, page_height = canvas._pagesize + canvas.drawImage(img_url, 0, 0, page_height, page_width, + preserveAspectRatio=True) + canvas.setPageSize((page_height, page_width)) + canvas.showPage() + canvas.save() if __name__ == '__main__': main() From 6860f7bb312d56cc4f2965ef455112ae510b25cf Mon Sep 17 00:00:00 2001 From: "Thomas \"Cakeisalie5\" Touhey" Date: Sat, 6 May 2017 16:32:06 +0200 Subject: [PATCH 3/3] Enhanced docstrings, commented an unclear bit of my contribution. --- slideslurp/__init__.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/slideslurp/__init__.py b/slideslurp/__init__.py index a36cb74..4166888 100644 --- a/slideslurp/__init__.py +++ b/slideslurp/__init__.py @@ -8,7 +8,10 @@ import requests import reportlab.pdfgen.canvas def parse_args(): - """ Parse the command-line arguments. """ + """ Parse the command-line arguments and return them as an object. + The returned object has the following properties: + - `url` (string): the URL of the slideshare to download; + - `out` (string): the output file path. """ descr = "Generate PDFs from Slideshare presentations" parser = argparse.ArgumentParser(description=descr) @@ -18,13 +21,22 @@ def parse_args(): help='force the output file name') args = parser.parse_args() + + # If the user doesn't specify an output filename, we'll deduce + # one from the URL. 
Here's an example URL: + # https://www.slideshare.net/FrisodeJong/iso-20022-for-dummies + # + # We're taking the path (`/FrisodeJong/iso-20022-for-dummies`), + # isolating the part after the second slash (the first slash + # always being at the first position), and appending `.pdf` to it. if not args.out: path = urllib.parse.urlparse(args.url[0]).path args.out = '%s.pdf'%path[path.find('/', 1) + 1:] return args def main(): - """ Main function. """ + """ Main function of the program. + Uses the global command-line arguments (not custom ones)! """ args = parse_args() res = requests.get(args.url[0])