admin管理员组文章数量:1643280
前期工作:
- 注册百度翻译api的账户(个人-高级版),注册后,每个月有200万的免费翻译字符数。
- 安装pdfminer3k(`pip install pdfminer3k`)
一、UI界面设计
点击路径按钮时弹出文件目录选择窗口,参考文章:
PYQT5实现文件目录浏览
PyQt5-对话框控件使用(QFileDialog)
二、主程序
参考文章:python如何提取英语pdf内容并翻译
知道怎么调用百度翻译的api之后,把各个功能绑定到UI控件上。程序比较简单,结合注释理解即可。
# app.py
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'app.ui'
#
# Created by: PyQt5 UI code generator 5.13.0
#
# WARNING! All changes made in this file will be lost!
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_Form(object):
    """Widget layout of the translator window, mirroring the Designer file 'app.ui'."""

    def setupUi(self, Form):
        """Create all child widgets on ``Form`` and assign their names and geometry.

        Two group boxes are built: ``groupBox`` (file list plus the add /
        delete / translate buttons) and ``groupBox_2`` (the two line edits
        with their labels).
        """

        def place(widget, name, x, y, width, height):
            # Position the widget, name it, and hand it back for assignment.
            widget.setGeometry(QtCore.QRect(x, y, width, height))
            widget.setObjectName(name)
            return widget

        Form.setObjectName("Form")
        Form.resize(577, 469)

        # File-selection group: list widget on the left, buttons on the right.
        self.groupBox = place(QtWidgets.QGroupBox(Form), "groupBox", 10, 120, 391, 241)
        files_box = self.groupBox
        self.bnt_add_file = place(
            QtWidgets.QPushButton(files_box), "bnt_add_file", 290, 30, 75, 23)
        self.bnt_translate = place(
            QtWidgets.QPushButton(files_box), "bnt_translate", 290, 200, 75, 23)
        self.files_listWidget = place(
            QtWidgets.QListWidget(files_box), "files_listWidget", 10, 30, 256, 192)
        self.bnt_delete_file = place(
            QtWidgets.QPushButton(files_box), "bnt_delete_file", 290, 70, 75, 23)

        # Credentials group: a label and a line edit per row.
        self.groupBox_2 = place(QtWidgets.QGroupBox(Form), "groupBox_2", 10, 10, 391, 101)
        account_box = self.groupBox_2
        self.label = place(QtWidgets.QLabel(account_box), "label", 30, 30, 54, 12)
        self.account = place(QtWidgets.QLineEdit(account_box), "account", 90, 30, 241, 21)
        self.password = place(QtWidgets.QLineEdit(account_box), "password", 90, 60, 241, 21)
        self.label_2 = place(QtWidgets.QLabel(account_box), "label_2", 30, 60, 54, 12)

        self.retranslateUi(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Apply the (translatable) captions to the window and its widgets."""
        tr = QtCore.QCoreApplication.translate
        captions = (
            (Form.setWindowTitle, "Translate"),
            (self.groupBox.setTitle, "选择文件"),
            (self.bnt_add_file.setText, "添加文件"),
            (self.bnt_translate.setText, "全部翻译"),
            (self.bnt_delete_file.setText, "删除文件"),
            (self.groupBox_2.setTitle, "百度翻译"),
            (self.label.setText, "帐号"),
            (self.label_2.setText, "密码"),
        )
        for set_caption, source_text in captions:
            set_caption(tr("Form", source_text))
# translate.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : {Jan__}
# @Time : 2021/2/11 15:17
import sys
from PyQt5.QtWidgets import QWidget, QFileDialog, QApplication
from app import Ui_Form
import importlib
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
#
import requests
import string
import time
import hashlib
import json
## Initialisation
# Baidu Translate endpoint. The host must carry its ".com" TLD
# ("api.fanyi.baidu" alone does not resolve).
api_url = "http://api.fanyi.baidu.com/api/trans/vip/translate"
api_id = ""  ## id of the Baidu Translate API account you applied for
cyber = ""  ## password (secret key) of the Baidu Translate API account you applied for
# 处理PDF
# 读取PDF的内容 filename是待处理的PDF的名字
class MyUi(QWidget, Ui_Form):
    """Main window: collects PDF/TXT files and translates them with the Baidu API.

    The account and password line edits supply the API id and secret used to
    sign every request. Translation runs synchronously inside the button
    slot, one sentence per HTTP request.
    """

    def __init__(self):
        super(MyUi, self).__init__()
        self.setupUi(self)  # build the widgets
        self.signal_connect()  # bind signals to slots

    def signal_connect(self):
        """Pre-fill the credential fields and connect the three buttons."""
        self.account.setText(api_id)
        self.password.setText(cyber)
        self.bnt_add_file.clicked.connect(self.bnt_add_file_slot)
        self.bnt_delete_file.clicked.connect(self.bnt_delete_file_slot)
        self.bnt_translate.clicked.connect(self.bnt_translate_slot)

    def bnt_add_file_slot(self):
        """Ask for one or more files and append them to the file list."""
        # Arguments: parent widget, dialog title, start directory ("." is the
        # working directory), and the extension filter. Several filters are
        # separated by double semicolons, e.g.
        # "All Files(*);;PDF Files(*.pdf);;Text Files(*.txt)".
        fnames, _ = QFileDialog.getOpenFileNames(self, '选择文件', "./", "Files(*.pdf *.txt)")
        # An empty selection (dialog cancelled) simply adds nothing.
        for fname in fnames:
            self.files_listWidget.addItem(fname)

    def bnt_translate_slot(self):
        """Translate every listed file into a directory chosen by the user.

        Files that were translated and saved are removed from the list; files
        that could not be read, translated or saved stay in the list so they
        can be retried.
        """
        Directory = QFileDialog.getExistingDirectory(self, '结果保存到目录', './')
        if not Directory:
            # Dialog cancelled: without this guard results would be written
            # to "/CN_<name>" in the filesystem root.
            return
        print("# 遍历翻译所有文件")
        row = 0
        while row < self.files_listWidget.count():
            filename = self.files_listWidget.item(row).text()
            if self._translate_file(filename, Directory):
                # The next file slides into this row, so the index stays put.
                self.files_listWidget.takeItem(row)
                print("删除文件")
            else:
                row += 1  # leave the failed file listed and move on

    def _translate_file(self, filename, directory):
        """Translate one file and save it as ``<directory>/CN_<name>.txt``.

        Returns True when the translation was written, False otherwise.
        """
        # Decide by the real extension; a substring search would also match
        # "pdf"/"txt" occurring in a directory name.
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            content = self.getDataFromPDF(filename)
        elif lowered.endswith('.txt'):
            content = self.getDataFromTxt(filename)
        else:
            content = None
        if content is None:
            print("读取文件失败")
            return False
        print("读取文件成功")

        # Only the file name's own suffix is rewritten, never the directory.
        base_name = filename.split('/')[-1]
        if base_name.lower().endswith('.pdf'):
            base_name = base_name[:-len('.pdf')] + '.txt'
        CNtextfile = directory + '/CN_' + base_name

        print("# 遍历翻译所有句子")
        # Sentences are approximated by splitting on "."; blank fragments
        # (e.g. after the final full stop) are skipped to save API calls.
        sentences = [part for part in content.split(".") if part.strip()]
        chinese = "".join(self.translate(part + '.') + '\n' for part in sentences)
        try:
            self.saveText(chinese, CNtextfile)
        except OSError as ex:
            print(ex)
            return False
        print("翻译结束,ok")
        return True

    def bnt_delete_file_slot(self):
        """Remove the currently selected entry from the file list."""
        row = self.files_listWidget.currentRow()
        if row < 0:
            # Nothing is selected, so there is nothing to delete.
            return
        self.files_listWidget.takeItem(row)
        print("删除文件")

    def getDataFromPDF(self, filename):
        """Extract the text of a PDF with pdfminer.

        Returns the text of all horizontal text boxes as one string, or None
        when the file cannot be read, parsed, or does not allow extraction.
        """
        try:
            # The handle must stay open while pdfminer reads pages, so the
            # whole extraction lives inside the ``with`` block.
            with open(filename, 'rb') as pdf_file:
                parser = PDFParser(pdf_file)
                doc = PDFDocument()
                # Link the parser and the document to each other.
                parser.set_document(doc)
                doc.set_parser(parser)
                # The document is assumed to have no password.
                doc.initialize("")
                if not doc.is_extractable:
                    raise PDFTextExtractionNotAllowed(filename)

                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pieces = []
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # The layout holds the objects parsed from this page;
                    # only horizontal text boxes carry the text we want.
                    for element in device.get_result():
                        if isinstance(element, LTTextBoxHorizontal):
                            text = element.get_text()
                            # Original author's note: removes hyphenation marks.
                            text = text.replace("(cid:2) ", "")
                            # Line breaks come from page layout; a space keeps
                            # the words on either side apart.
                            text = text.replace("\n", " ")
                            pieces.append(text)
                return "".join(pieces)
        except Exception as ex:
            # Boundary with a third-party parser: report and signal failure.
            print(ex)
            return None

    def getDataFromTxt(self, filename):
        """Read a UTF-8 text file; return its text on one line, or None on failure."""
        try:
            with open(filename, "r", encoding='utf-8') as f:
                text = f.read()
        except (OSError, UnicodeDecodeError) as ex:
            print(ex)
            return None
        # Line breaks become spaces so adjacent words are not glued together.
        return text.replace("\n", " ")

    def saveText(self, content, Textfile):
        """Write ``content`` to ``Textfile`` as UTF-8, replacing any existing file."""
        with open(Textfile, "w", encoding='utf-8') as f:
            f.write(content)

    def translate(self, content):
        """Translate ``content`` from English to Chinese through the Baidu API.

        Returns the translated text followed by a space, the raw response as
        a string when it has no 'trans_result', or "" when the request or the
        decoding of its response fails.
        """
        appid = str(self.account.text())
        salt = str(int(time.time()))
        # Request signature: MD5 over appid + query + salt + secret.
        raw_sign = appid + content + salt + self.password.text()
        final_sign = hashlib.md5(raw_sign.encode("utf-8")).hexdigest()
        # 'from' / 'to' select the source and target languages.
        paramas = {
            'q': content,
            'from': 'en',
            'to': 'zh',
            'appid': appid,
            'salt': salt,
            'sign': final_sign,
        }
        try:
            # The timeout keeps a stalled connection from hanging the GUI forever.
            response = requests.get(api_url, params=paramas, timeout=10)
            json_reads = json.loads(response.content.decode("utf-8"))
        except (requests.RequestException, ValueError) as ex:
            print(ex)
            return ""
        if 'trans_result' in json_reads:
            return " ".join(item['dst'] for item in json_reads['trans_result']) + " "
        return str(json_reads)
if __name__ == '__main__':
    # Script entry point: create the application, show the window, run the loop.
    try:
        # sys.argv hands the command-line arguments over to Qt.
        app = QApplication(sys.argv)
        myshow = MyUi()
        myshow.show()
        exit_code = app.exec_()  # runs the event loop until it ends
        sys.exit(exit_code)  # exit with the event loop's return code
    except Exception as ex:
        print(ex)
三、问题小结
选择文件时报错:
log4cplus:ERROR No appenders could be found for logger (AdSyncNamespace).
log4cplus:ERROR Please initialize the log4cplus system properly.
解决办法:
目录不要含有中文
打开txt文件时报错:
'utf-8' codec can't decode byte 0xa1 in position 8: invalid start byte
解决办法:
txt文件保存时,编码格式需选择utf-8,参考文章:python 报错"UnicodeDecodeError: ‘utf-8’ codec can’t decode byte"的解决办法
不足:
简单翻译英文段落没问题,想翻译期刊文献就不行了,图、表、分栏这些干扰太多了。
版权声明:本文标题:用python写一个PDF翻译软件 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://www.elefans.com/dianzi/1725599299a1032260.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论