PDF(Portable Document Format 的简称,意为“便携式文档格式”)是由 Adobe Systems 开发的一种文件格式,用于以与应用程序、操作系统、硬件无关的方式进行文件交换。
pdftotext temp.pdf temp.txt
from selenium import webdriver
from lxml import etree
import time
# Open the local PDF in Firefox and scrape the text of the matching <span>
# elements from the DOM the browser produces for it (presumably the DOM of
# Firefox's built-in PDF viewer -- confirm against the XPath below).
driver = webdriver.Firefox()
try:
    driver.get("file:///E:/pdf信息提取/temp.pdf")
    # Fixed pause so the page has time to render before the DOM is read.
    time.sleep(5)
    html = driver.page_source
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and running it in `finally` releases them even if loading fails.
    driver.quit()

# The page source is already a plain string, so parsing can happen after the
# browser has been shut down.
tree = etree.HTML(html)
# Absolute XPath to the text spans; tied to the exact DOM layout of the page.
content_xpath = "/html/body/div[1]/div[2]/div[4]/div/div/div[2]/span"
content_list = tree.xpath(content_xpath)
for c in content_list:
    print(c.text)
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
# Extract the text of every horizontal text box in the PDF using pdfminer's
# layout analysis, printing each box as it is found.
#
# All processing stays inside the `with` block so the file is open while the
# parser reads from it, and is closed afterwards even if parsing raises.
with open(r'E:/pdf信息提取/temp.pdf', 'rb') as file:
    praser = PDFParser(file)
    doc = PDFDocument()
    # Parser and document must be linked to each other before initialising.
    praser.set_document(doc)
    doc.set_parser(praser)
    # No password is supplied.
    doc.initialize()

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The aggregator collects the laid-out objects of each processed page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        # Layout of the page that was just processed.
        layout = device.get_result()
        for x in layout:
            # Only horizontal text boxes are printed; every other layout
            # object is skipped.
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text()
                print(results)
set-ExecutionPolicy Remotesigned
* Convert temp.pdf to temp.html, then load the HTML into a single string
* variable and strip the markup so that only the text remains.
* NOTE(review): wordconvert is a community-contributed command; presumably
* `replace` overwrites an existing temp.html and `encoding("gb18030")` sets
* the encoding of the output -- confirm against its help file.
wordconvert temp.pdf temp.html, replace encoding("gb18030")
* Start from an empty dataset holding exactly one observation.
clear
set obs 1
* Read the whole HTML file into variable v of that single observation.
gen v = fileread("temp.html")
* Convert the text from GB18030 to Stata's internal UTF-8; the third argument
* (1) selects how invalid byte sequences are handled (see ustrfrom()).
replace v = ustrfrom(v, "gb18030", 1)
* Remove every carriage return and line feed, so the text becomes one line
* (presumably so that the patterns below can match across former line breaks).
replace v = ustrregexra(v, "\r|\n", "")
* Drop each <style>...</style> block (non-greedy); only a bare "<style>"
* opening tag without attributes is matched by this pattern.
replace v = ustrregexra(v, "<style>.*?</style>", "")
* Drop every remaining tag, i.e. anything between "<" and the next ">".
replace v = ustrregexra(v, "<.*?>", "")
* Let Stata shrink v to the smallest storage type that holds the result.
compress