Merge pull request #1 from cakeisalie5/master
Corrected one or two things.
This commit is contained in:
12
.editorconfig
Normal file
12
.editorconfig
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Editor config -- see http://editorconfig.org/
|
||||||
|
root = true
|
||||||
|
|
||||||
|
# Unix-style newlines
|
||||||
|
[*]
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# 4 space indentation
|
||||||
|
[*.py]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
@@ -1,40 +1,56 @@
|
|||||||
|
""" Slideslurp scrapes a PDF from SlideShare.net (LinkedIn). """
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
import urllib
|
||||||
import bs4
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
|
import reportlab.pdfgen.canvas
|
||||||
from reportlab.pdfgen import canvas
|
|
||||||
|
|
||||||
|
|
||||||
def _default_output_name(url):
    """ Deduce an output file name from a SlideShare URL.

    Here's a URL example:
    https://www.slideshare.net/FrisodeJong/iso-20022-for-dummies

    The path (`/FrisodeJong/iso-20022-for-dummies`) is split into its
    non-empty components; the first one (the user name) is dropped when
    something follows it, and `.pdf` is appended to what is left.

    - `url` (string): the URL given on the command line.

    Returns the file name (string). The result never contains a slash,
    and falls back to `out.pdf` when the URL has no usable path. """
    # Imported explicitly: a bare `import urllib` does not guarantee
    # that the `urllib.parse` submodule has been loaded.
    import urllib.parse

    path = urllib.parse.urlparse(url).path
    # Dropping empty components copes with leading, trailing and
    # doubled slashes.
    segments = [part for part in path.split('/') if part]
    if not segments:
        return 'out.pdf'

    # Skip the user name when there is more than one component; join
    # any remaining components with `_` so that the name cannot point
    # into a (possibly non-existent) sub-directory.
    name_parts = segments[1:] or segments
    return '%s.pdf' % '_'.join(name_parts)


def parse_args():
    """ Parse the command-line arguments and return them as an object.

    The returned object has the following properties:
    - `url` (list of one string): the URL of the slideshare to download
      (a list because the argument is declared with `nargs=1`);
    - `out` (string): the output file path, either forced through
      `--out` / `--output` / `-o` or deduced from the URL. """
    descr = "Generate PDFs from Slideshare presentations"

    parser = argparse.ArgumentParser(description=descr)
    parser.add_argument('url', metavar='url', nargs=1,
                        help='the URL to slurp')
    parser.add_argument('--out', '--output', '-o', default=None,
                        help='force the output file name')

    args = parser.parse_args()

    # If the user doesn't specify an output filename, deduce one from
    # the URL.
    if not args.out:
        args.out = _default_output_name(args.url[0])

    return args
|
||||||
|
|
||||||
def main():
    """ Main function of the program.

    Uses the global command-line arguments (not custom ones)!

    Downloads the page given on the command line, looks for every
    `<img class="slide_image">` element, and writes a PDF with one
    landscape page per slide, using the image found in the element's
    `data-full` attribute.

    Raises `requests.HTTPError` if the page request returns an HTTP
    error status. """
    args = parse_args()

    res = requests.get(args.url[0])
    # Stop here on HTTP errors instead of scraping an error page and
    # writing an empty PDF.
    res.raise_for_status()
    tree = bs4.BeautifulSoup(res.text, "html.parser")

    pdf = reportlab.pdfgen.canvas.Canvas(args.out)

    # Swap the canvas' initial page size exactly once to get landscape
    # pages. Doing the swap per page (from the current `_pagesize`)
    # would flip the orientation back and forth on every slide, since
    # the size set with `setPageSize` also applies to subsequent pages.
    # NOTE(review): `_pagesize` is a private attribute of the canvas;
    # it is kept because the original code relied on it.
    portrait_width, portrait_height = pdf._pagesize
    page_width, page_height = portrait_height, portrait_width
    pdf.setPageSize((page_width, page_height))

    for img in tree.find_all("img", class_="slide_image"):
        img_url = img.attrs.get("data-full")
        if not img_url:
            # No full-size image advertised for this element: skip it
            # rather than abort the whole document.
            continue

        pdf.drawImage(img_url, 0, 0, page_width, page_height,
                      preserveAspectRatio=True)
        pdf.showPage()

    pdf.save()
|
||||||
|
|
||||||
# Run the scraper only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||||
|
Reference in New Issue
Block a user