#!/usr/bin/env python
# coding=utf-8
import os
from io import StringIO
from os import path

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    """Extract the text of a PDF file and return it as one string.

    Each page yielded by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into an in-memory buffer; the buffer's
    contents are returned once all pages have been processed.

    Args:
        path: File-system path of the PDF to read. (Inside this function the
            parameter shadows the module-level ``os.path`` import.)

    Returns:
        The extracted text as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        Any exception pdfminer raises while parsing is propagated unchanged;
        the file, converter and buffer are closed in every case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set, maxpages=0 and an empty password are the
            # values the original code passed; per pdfminer's get_pages
            # semantics they request every page of an unencrypted document.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the cleanup below closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# 保存成文本文件 (Save the extracted text to a text file)
def saveTxt(txt, filename):
    """Write ``txt`` to a ``txt`` file named after ``filename``.

    The output path is ``filename`` with its last three characters replaced
    by ``txt`` (so ``report.pdf`` becomes ``report.txt``). An existing file
    at that path is overwritten.

    Args:
        txt: The text to write.
        filename: Path of the source file; expected to end in a
            three-character extension such as ``pdf``.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    txt_path = filename[:-3] + 'txt'
    # Write UTF-8 explicitly: text extracted from PDFs routinely contains
    # characters that the platform default encoding cannot represent.
    with open(txt_path, "w", encoding="utf-8") as f:
        print('openTxt:' + txt_path)
        f.write(txt)
# 转换一个文件夹中的所有 PDF 文件 (Convert every PDF file in a folder)
# Characters removed from the extracted text before saving: copyright sign,
# no-break space, soft hyphen and Greek question mark.
_UNWANTED_CHARS = str.maketrans('', '', u'\xa9\xa0\xad\u037e')


def traversal(rootdir):
    """Convert every PDF found under ``rootdir`` to a text file.

    Walks ``rootdir`` recursively with ``os.walk``. Each file whose full
    path ends in ``pdf`` or ``PDF`` is passed to ``convert_pdf_to_txt``;
    the result, with the characters in ``_UNWANTED_CHARS`` removed, is
    handed to ``saveTxt`` together with the PDF's path.

    Args:
        rootdir: Directory at which the recursive walk starts.
    """
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            if not filenameFull.endswith(('pdf', 'PDF')):
                continue
            # The original called an undefined ``readPDF``; the converter
            # defined in this file is ``convert_pdf_to_txt``.
            txt = convert_pdf_to_txt(filenameFull)
            saveTxt(txt.translate(_UNWANTED_CHARS), filenameFull)


if __name__ == '__main__':
    rootdir = './'
    traversal(rootdir)
# 我们拿其中的一篇 PDF 文件做解析：“[PDF] WHO R&D Blueprint novel Coronavirus prospects for evaluating cross-reactivity of nCoV with SARS-CoV January 24, 2020, Geneva, Switzerland.pdf”