Merge pull request #1 from cakeisalie5/master

Corrected one or two things.
This commit is contained in:
2017-05-06 17:07:13 +02:00
committed by GitHub
2 changed files with 43 additions and 15 deletions

12
.editorconfig Normal file
View File

@@ -0,0 +1,12 @@
# Editor config -- see http://editorconfig.org/
root = true
# Unix-style newlines
[*]
end_of_line = lf
insert_final_newline = true
# 4 space indentation
[*.py]
indent_style = space
indent_size = 4

View File

@@ -1,40 +1,56 @@
""" Slideslurp scrapes a PDF from SlideShare.net (LinkedIn). """
import argparse import argparse
import sys import sys
import urllib
import bs4 import bs4
import requests import requests
import reportlab.pdfgen.canvas
from reportlab.pdfgen import canvas
def parse_args(): def parse_args():
""" Parse the command-line arguments and return them as an object.
The returned object has the following properties:
- `url` (string): the URL of the slideshare to download;
- `out` (string): the output file path. """
descr = "Generate PDFs from Slideshare presentations" descr = "Generate PDFs from Slideshare presentations"
parser = argparse.ArgumentParser(description=descr) parser = argparse.ArgumentParser(description=descr)
parser.add_argument('url', metavar='url', nargs=1, parser.add_argument('url', metavar='url', nargs=1,
help='the URL to slurp') help='the URL to slurp')
parser.add_argument('--output', '-o', default='out.pdf', parser.add_argument('--out', '--output', '-o', default=None,
help='the file to write to (default: out.pdf)') help='force the output file name')
return parser.parse_args() args = parser.parse_args()
# If the user doesn't specify an output filename, we'll deduce
# one from the URL. Here's an example URL:
# https://www.slideshare.net/FrisodeJong/iso-20022-for-dummies
#
# We're taking the path (`/FrisodeJong/iso-20022-for-dummies`),
# isolating the part after the second slash (the first slash
# always being at the first position), and appending `.pdf` to it.
if not args.out:
path = urllib.parse.urlparse(args.url[0]).path
args.out = '%s.pdf'%path[path.find('/', 1) + 1:]
return args
def main(): def main():
""" Main function of the program.
Always reads the process's own command-line arguments; custom ones cannot be passed in. """
args = parse_args() args = parse_args()
res = requests.get(args.url[0]) res = requests.get(args.url[0])
tree = bs4.BeautifulSoup(res.text, "html.parser") tree = bs4.BeautifulSoup(res.text, "html.parser")
c = canvas.Canvas(args.output) canvas = reportlab.pdfgen.canvas.Canvas(args.out)
for img in tree.findAll("img", class_="slide_image"): for img in tree.findAll("img", class_="slide_image"):
img_url = img.attrs["data-full"] img_url = img.attrs["data-full"]
page_width, page_height = c._pagesize page_width, page_height = canvas._pagesize
c.setPageRotation(90) canvas.drawImage(img_url, 0, 0, page_height, page_width,
c.drawImage(img_url, 0, 0, page_height, page_width, preserveAspectRatio=True) preserveAspectRatio=True)
c.showPage() canvas.setPageSize((page_height, page_width))
c.save() canvas.showPage()
canvas.save()
if __name__ == '__main__': if __name__ == '__main__':
main() main()