Merge pull request #1 from cakeisalie5/master

Corrected one or two things.
2017-05-06 17:07:13 +02:00
parent f38a6fb596 6860f7bb31
commit 2b30157636
2 changed files with 43 additions and 15 deletions
@@ -0,0 +1,12 @@
+# Editor config -- see http://editorconfig.org/
+root = true
+
+# Unix-style newlines
+[*]
+end_of_line = lf
+insert_final_newline = true
+
+# 4 space indentation
+[*.py]
+indent_style = space
+indent_size = 4
@@ -1,40 +1,56 @@
+""" Slideslurp scrapes a PDF from SlideShare.net (LinkedIn). """
+
 import argparse
 import sys
-
+import urllib
 import bs4
 import requests
-
-from reportlab.pdfgen import canvas
-
+import reportlab.pdfgen.canvas

 def parse_args():
+    """ Parse the command-line arguments and return them as an object.
+    The returned object has the following properties:
+    - `url` (list of one string): the URL of the slideshare to download;
+    - `out` (string): the output file path. """
    descr = "Generate PDFs from Slideshare presentations"

    parser = argparse.ArgumentParser(description=descr)
    parser.add_argument('url', metavar='url', nargs=1,
                        help='the URL to slurp')
-    parser.add_argument('--output', '-o', default='out.pdf',
-                        help='the file to write to (default: out.pdf)')
+    parser.add_argument('--out', '--output', '-o', default=None,
+                        help='force the output file name')

-    return parser.parse_args()
+    args = parser.parse_args()

+    # If the user doesn't specify an output filename, we'll deduce
+    # one from the URL. Here's an example URL:
+    # https://www.slideshare.net/FrisodeJong/iso-20022-for-dummies
+    #
+    # We're taking the path (`/FrisodeJong/iso-20022-for-dummies`),
+    # isolating the part after the second slash (the first slash
+    # always being at the first position), and appending `.pdf` to it.
+    if not args.out:
+        path = urllib.parse.urlparse(args.url[0]).path
+        args.out = '%s.pdf'%path[path.find('/', 1) + 1:]
+    return args

 def main():
+    """ Main function of the program.
+    Uses the global command-line arguments (not custom ones)! """
    args = parse_args()

    res = requests.get(args.url[0])
-
    tree = bs4.BeautifulSoup(res.text, "html.parser")
-    c = canvas.Canvas(args.output)
+    canvas = reportlab.pdfgen.canvas.Canvas(args.out)

    for img in tree.findAll("img", class_="slide_image"):
        img_url = img.attrs["data-full"]
-        page_width, page_height = c._pagesize
-        c.setPageRotation(90)
-        c.drawImage(img_url, 0, 0, page_height, page_width, preserveAspectRatio=True)
-        c.showPage()
-    c.save()
-
+        page_width, page_height = canvas._pagesize
+        canvas.drawImage(img_url, 0, 0, page_height, page_width,
+                         preserveAspectRatio=True)
+        canvas.setPageSize((page_height, page_width))
+        canvas.showPage()
+    canvas.save()

 if __name__ == '__main__':
    main()