This repository has been archived by the owner on Mar 9, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdftitle.py
74 lines (67 loc) · 2.8 KB
/
pdftitle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
###
# Copyright (c) 2017, Brandon Roberts <brandon@bxroberts.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
import PyPDF2
import io
def pdf2information(content):
    """
    Extract a title and a page-count label from a PDF's raw bytes.

    The title is read from the PDF document-info metadata. Many PDFs omit
    this information, so a possible future fallback would be extracting
    the first N chars from the document or looking for bookmarks or
    related metadata.

    This function is best-effort: any exception raised while parsing or
    querying the PDF is reported on stdout and swallowed, and the
    corresponding value is left blank.

    Args:
        content: the PDF file's data as bytes.

    Returns:
        A dict with two string values:
          "title" -- the metadata title, or '' if missing/unreadable.
          "pages" -- '1 page' or 'N pages', or '' if the count could not
                     be read or is not positive.
    """
    # we'll collect our information about the document here
    info = {
        "pages": '',
        "title": ''
    }

    try:
        reader = PyPDF2.PdfFileReader(io.BytesIO(content))
    except Exception as e:
        print("PDF parse error: {}".format(e))
        # Nothing can be extracted without a reader; hand back the blank
        # dict so callers always get the same shape they get on success.
        return info

    try:
        title = reader.getDocumentInfo().title
    except Exception as e:
        print("error getting pdf documentinfo/title: {}".format(e))
    else:
        # title may be None/empty when the metadata lacks one
        if title:
            info["title"] = title

    try:
        pages = reader.getNumPages()
    except Exception as e:
        print("PDF getNumPages error: {}".format(e))
    else:
        if pages == 1:
            info["pages"] = '{} page'.format(pages)
        elif pages > 1:
            info["pages"] = '{} pages'.format(pages)

    # values are still blank strings for anything that failed above
    return info