python中正则表达式与jieba分词的使用

编程入门 行业动态 更新时间:2024-10-25 10:34:46

python中正则表达式与jieba<a href="https://www.elefans.com/category/jswz/34/1763864.html">分词</a>的使用

python中正则表达式与jieba分词的使用

这次和大家分享一个主要使用正则表达式匹配文本信息的案例,其中还用到了 jieba 分词的词性标注技术,以及一些文本切片处理。有兴趣学习的读者可以仔细阅读下面的具体内容,应该会有所帮助;这 1000 多行代码都是本人逐行敲出来的。

# coding:utf-8
import os
import docx
import pickle
import time
import datetime
import re
import sys
import jieba.posseg as psg
import numpy as np

# Make the parent directory of this file importable.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Default for the debug switch. read_sys() overwrites it; defining it here keeps
# the extractor functions usable when this module is imported rather than run.
is_debug = False

# Build the JSON result structure
# params: wtjdxx   委托鉴定信息
# params: jd_tj_ff 鉴定条件与方法
# params: jdqk     鉴定情况
# params: wbxx     full document text
def _extract_result(wtjdxx, jdff_tj, jdqk, wbxx):
    """Run every field extractor and assemble the result dictionary.

    Args:
        wtjdxx: Text slice holding the commission/appraisal information.
        jdff_tj: Text slice holding the appraisal methods and conditions.
        jdqk: Text slice holding the appraisal outcome.
        wbxx: Full document text (accepted for interface compatibility; not
            read by this function).

    Returns:
        dict with the keys ``"WTJDXX"``, ``"JDFF_TJ"`` and ``"JDQK"``; each
        value is a dict filled in place by the ``_get_*`` helpers.
    """
    result_json = {}

    # ---- commission / appraisal information ----
    # Disabled extractors (no implementation yet): _get_rwlylb, _get_ajmc,
    # _get_ay, _get_sjrlxdh, _get_ssjd, _get_zttjbs.
    WTJDXX = {}
    for extractor in (
        _get_bmsah,
        _get_slrq,
        _get_rwlydw,
        _get_wtdwjb,
        _get_sjr,
        _get_aqzy,
        _get_sjzy,
        _get_wtyq,
        _get_jdlb,
    ):
        extractor(WTJDXX, wtjdxx)
    result_json["WTJDXX"] = WTJDXX

    # ---- appraisal methods and conditions ----
    JDFF_TJ = {}
    for extractor in (_get_sysbjs, _get_sydbzgf, _get_sydbzohhc):
        extractor(JDFF_TJ, jdff_tj)
    result_json["JDFF_TJ"] = JDFF_TJ

    # ---- appraisal outcome ----
    JDQK = {}
    for extractor in (
        _get_ksjdrq,
        _get_bjrq,
        _get_bljg,
        _get_jdslkrq,
        _get_wtrchrq,
        _get_qtyyzzrq,
        _get_dyjdr,
        _get_dejdr,
        _get_dsjdr,
        _get_qtjdr,
        _get_jybgfs_jdsfs,
        _get_wpzsl,
    ):
        extractor(JDQK, jdqk)
    result_json["JDQK"] = JDQK

    return result_json


# Extract named-entity results
# json抽取实体
# Shape of the extracted result (all values are strings unless noted):
# {
#   'WTJDXX': {                 # commission / appraisal information
#       'BMSAH': '',            # department case-acceptance number
#       'SLRQ': '',             # acceptance date
#       'RWLYLB': '',           # task source category
#       'RWLYDW': '',           # task source unit
#       'WTDWJB': '',           # level of the commissioning unit
#       'AJMC': '',             # case name
#       'AY': '',               # cause of action
#       'SJR': '',              # person submitting the material
#       'SJRLXDH': '',          # submitter's phone number
#       'SSJD': '',             # litigation stage
#       'JDLB': '',             # appraisal category
#       'SJZY': '',             # discipline involved
#       'ZTTJBS': '',           # special-statistics flag
#       'AQZY': '',             # case summary
#       'WTYQ': '',             # commission request
#   },
#   'JDFF_TJ': {                # appraisal methods and conditions
#       'SYSBJS': '',           # equipment / technology used
#       'SYDBZGF': '',          # standards used
#       'SYDBZPHHC': '',        # reference materials and consumables used
#   },
#   'JDQK': {                   # appraisal outcome
#       'KSJDRQ': '',           # appraisal start date
#       'BJRQ': '',             # completion date
#       'JDSLKRQ': '',          # signature date of the appraisal document
#       'WTRCHRQ': '',          # date withdrawn by the commissioner
#       'QTYYZZRQ': '',         # date terminated for other reasons
#       'BLJG': '',             # handling result
#       'DYJDR': '',            # first appraiser
#       'DRJDR': '',            # second appraiser
#       'DSJDR': '',            # third appraiser
#       'QTCYR': '',            # other participants
#       'JYBGFS': '',           # number of examination reports
#       'JDSFS': '',            # number of appraisal documents
#       'WPZJSL': '',           # number of external experts
#   },
# }
def extract_ner(slice_result):
    """Extract all fields from the text slices produced by ``data_slice``.

    Args:
        slice_result: dict with the keys '委托鉴定信息', '鉴定条件与方法',
            '鉴定情况' and '全文检索', each mapping to a string.

    Returns:
        The dict built by ``_extract_result``.
    """
    # The debug flag only exists as a module global after read_sys() has run,
    # so look it up defensively.
    if globals().get('is_debug', False):
        print('【委托鉴定信息】' + slice_result['委托鉴定信息'])
        print('【鉴定条件与方法】' + slice_result['鉴定条件与方法'])
        print('【鉴定情况】' + slice_result['鉴定情况'])
    item = _extract_result(
        slice_result['委托鉴定信息'],
        slice_result['鉴定条件与方法'],
        slice_result['鉴定情况'],
        slice_result['全文检索'],
    )
    return item


def data_slice(wbxx):
    """Split the document text into the slices used by the extractors.

    ASCII spaces are removed first. Every slice falls back to the whole
    (space-stripped) text when its pattern does not match, so all four keys
    are always present and always hold strings.

    Args:
        wbxx: Full document text.

    Returns:
        dict with the keys '委托鉴定信息', '鉴定条件与方法', '鉴定情况' and
        '全文检索'.
    """
    wbxx = wbxx.replace(' ', '')
    content = wbxx
    # Pre-populate with the fallback so a failure below cannot leave a key
    # missing for extract_ner().
    slice_result = {
        '委托鉴定信息': content,
        '鉴定条件与方法': content,
        '鉴定情况': content,
        '全文检索': wbxx,
    }
    try:
        # Header slice: tried in order, first hit wins. The whole match is
        # used (re.search + group(0)); capture groups would otherwise make
        # findall return tuples or just the terminator text. \s* rather than
        # \s+ because spaces have already been stripped.
        wtjdxx_patterns = (
            r'(?:检验报告).*?(?=(?:文本摘要|检验过程))',
            r'检验报告\s*\S{1,30}?号.*?(?=资料摘要)',
            r'鉴定书\s*\S{1,30}?号.*?(?=(?:文本摘要|资料摘要))',
            r'鉴定书\s*\S{1,30}?号.*?(?=[一二三四五六七八九十]?、检验:)',
        )
        for pattern in wtjdxx_patterns:
            found = re.search(pattern, content, re.S)
            if found:
                slice_result['委托鉴定信息'] = found.group(0)
                break

        # Methods-and-conditions slice.
        found = re.search(
            r'(?:检验过程:).*?(?=[一二三四五六七八九十]?、检验结果)',
            content,
            re.S,
        )
        if found:
            slice_result['鉴定条件与方法'] = found.group(0)
    except Exception as e:
        if globals().get('is_debug', False):
            print('data_slice 切片异常' + str(e))
    return slice_result


# Main routine
def main(wbxx):
    """Slice the document text and extract every field from it.

    Args:
        wbxx: Full document text.

    Returns:
        The result dict produced by ``extract_ner``.
    """
    started = time.perf_counter()
    # Split the text into business-level slices, then extract the fields.
    slice_result = data_slice(wbxx)
    extract_result = extract_ner(slice_result)
    # The debug flag only exists as a module global after read_sys() has run.
    if globals().get('is_debug', False):
        elapsed_ms = (time.perf_counter() - started) * 1000
        print('NLP解析结果耗时:%dms' % elapsed_ms)
    return extract_result


# ******************************* commission / appraisal information START *******************************
# "BMSAH": '',  # 部门受案号
def _get_bmsah(WTJDXX,wtjdxx):try:#鲁济检技鉴〔2015〕59号   滨检技鉴〔2014〕11号   青检技鉴〔2018〕1号WTJDXX['BMSAH'] = ''.join(re.findall('(?<=检验报告).*?(\S{1,30}?号)', wtjdxx, re.S))if not WTJDXX['BMSAH']:WTJDXX['BMSAH'] = ''.join(re.findall('(?<=检验鉴定文书).*?(\S{1,30}?号)', wtjdxx, re.S))except Exception as e:if is_debug:print('获取部门受案号异常:' + e.__str__())WTJDXX['BMSAH']= ''#"SLRQ": ' ', #受理日期
def _get_slrq(WTJDXX,wtjdxx):try:wtjdxx=wtjdxx.replace(' ','')#委托日期:2017年10月11日  委托日期:   委托日期:2014年7月17日   委托日期:2014.1.15WTJDXX['SLRQ'] =''.join(re.findall('(?<=委托日期:)'+ ('\d{4}'+'年?.?\s?'+'\d{1,2}'+'月?.?\s?'+'\d{1,2}'+'日?.?\s'),wtjdxx))#针对有的文书 写错了的情况下if not WTJDXX['SLRQ']:WTJDXX['SLRQ'] =''.join(re.findall('(?<=委托日期:)'+ ('\d{4}'+'你那?.?\s?'+'\d{1,2}'+'月?.?\s?'+'\d{1,2}'+'日?.?\s'),wtjdxx))if not WTJDXX['SLRQ']:WTJDXX['SLRQ'] = ''.join(re.findall('(?<=委托时间:)' + ('\d{4}' + '年?.?\s?' + '\d{1,2}' + '月?.?\s?' + '\d{1,2}' + '日?.?\s'), wtjdxx))except Exception as e:if is_debug:print('获取受理日期异常:' + e.__str__())WTJDXX['SLRQ']= ''#'RWLYLB': '',#任务来源类别
# def _get_rwlylb(wtjdxx):
#     pass#'RWLYDW': '',#任务来源单位   获取委托单位
def _get_rwlydw(WTJDXX,wtjdxx):try:#委托单位:青岛市人民检察院刑事执行检察处WTJDXX['RWLYDW'] = ''.join(re.findall('(?<=委托单位:).*?(?=委托日期)',wtjdxx, re.DOTALL))if not WTJDXX['RWLYDW']:WTJDXX['RWLYDW'] = ''.join(re.findall('(?<=委托单位:).*?(?=委托时间)', wtjdxx, re.DOTALL))except Exception as e:if is_debug:print('获取任务来源单位异常:' + e.__str__())WTJDXX['RWLYDW']  = ''#'WTDWJB': '',#委托单位级别   (县区级院  地市级院 省级院 高检院 系统外单位)
def _get_wtdwjb(WTJDXX,wtjdxx):try:WTDW = re.findall('(?<=委托单位:).*?(?=委托日期)',wtjdxx, re.DOTALL)if not WTDW:WTDW = re.findall('(?<=委托单位:).*?(?=委托时间)', wtjdxx, re.DOTALL)#青岛市人民检察院刑事执行检察处 槐荫区人民检察院 无棣县院 莱芜市人民检察院if '县' or '区' in WTDW[0]:WTJDXX['WTDWJB'] = '县区级院'if '市' in WTDW[0]:WTJDXX['WTDWJB'] = '地市级院'if '省' in WTDW[0]:WTJDXX['WTDWJB'] = '省级院'if '最高检' in WTDW[0]:WTJDXX['WTDWJB'] = '高检院'# else:#     WTJDXX['WTDWJB'] = '系统外单位'except Exception as e:if is_debug:print('获取委托单位级别异常:' + e.__str__())WTJDXX['WTDWJB']= ''#获取鉴定类别 如果出现什么就是什么 如果没有 则返回  首次鉴定
def _get_jdlb(WTJDXX,wtjdxx):try:wtjdxx=wtjdxx.replace(' ','').replace('  ','').replace('   ','')if '补充鉴定' in wtjdxx:WTJDXX['JDLB'] = '补充鉴定'if '重新鉴定' in wtjdxx:WTJDXX['JDLB'] = '重新鉴定'else:WTJDXX['JDLB'] = '首次鉴定'except Exception as e:if is_debug:print('获取鉴定类别异常:' + e.__str__())WTJDXX['JDLB'] = ''#'AJMC': '',#案件名称
# def _get_ajmc(AYMC,wtjdxx,wbxx):
#     pass#'AY': '',#案由
# def _get_ay(AY,wtjdxx,wbxx):
#     pass#'SJR': '',#送检人
def _get_sjr(WTJDXX,wtjdxx):try:wtjdxx = wtjdxx.replace(' ','').replace('  ','').replace('   ','')#送 检 人:蔡文锋 李成业WTJDXX['SJR'] = ''.join(re.findall('(?<=送检人:).*?(?=送检材料)', wtjdxx, re.DOTALL))except Exception as e:if is_debug:print('获取送检人异常:' + e.__str__())WTJDXX['SJR'] = ''#'SJRLXDH': '',送检人联系电话
# def _get_sjrlxdh(SJRLXDH,wtjdxx,wbxx):
#     pass#'SSJD': ,#诉讼阶段
# def _get_ssjd(SSJD,wttjdxx,wbxx):
#     pass#'SJZY': '',#涉及专业
# 济南民营科技产业园西沙实业公司2012年11月第5号记帐凭证及所附原始凭证。表示司法会计 记帐凭证 收入 账页 明账 细账'
# 伤情是否存在及伤情程度  表示法医临床
# 对潘洪光尸体进行尸表检验,查明是否有体表伤。标识法医病理   等等
def _get_sjzy(WTJDXX,wtjdxx):try:#涉及专业信息和内容wtjdxx=wtjdxx.replace(' ','').replace('  ','').replace('   ','').replace('    ','')sjzyxx = re.compile('送检材料:(.*?)检验开始日期:',re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)检验开始时间:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)鉴定开始日期:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)鉴定开始时间:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)开始检验日期:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)开始检验时间:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)开始鉴定日期:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)开始鉴定时间:', re.S)sjzynr = sjzyxx.findall(wtjdxx)if not sjzynr:sjzyxx = re.compile('送检材料:(.*?)检验:?', re.S)sjzynr = sjzyxx.findall(wtjdxx)if '尸体' in sjzynr[0] or '尸表' in sjzynr[0]:WTJDXX['SJZY'] = '法医病理'if '记账凭证' in sjzynr[0] or '凭证' in sjzynr[0] or '收入' in sjzynr[0] or '资金' in sjzynr[0] or '工资' in sjzynr[0] or '明账' in sjzynr[0] or '细账' in sjzynr[0] or '账页' in sjzynr[0] or '数额' in sjzynr[0] or '总额' in sjzynr[0] or '票据' in sjzynr[0] or '金额' in sjzynr[0] or '报销' in sjzynr[0] or '资金' in sjzynr[0] or '赃款' in sjzynr[0] or '贷款' in sjzynr[0] or '补偿款' in sjzynr[0] or '会计' in sjzynr[0]:WTJDXX['SJZY'] = '司法会计'if '伤情' in sjzynr[0] or '损伤' in sjzynr[0] or '法医' in sjzynr[0]:WTJDXX['SJZY'] = '法医临床'if '数据' in sjzynr[0] or '恢复' in  sjzynr[0]  or '提取' in  sjzynr[0] or 'U盘' in  sjzynr[0] or '微信' in  sjzynr[0] or 'QQ' in  sjzynr[0] or 'qq' in  sjzynr[0] or '短信' in  sjzynr[0]:WTJDXX['SJZY'] = '电子数据'if '精神病' in sjzynr[0]:WTJDXX['SJZY'] = '法医精神病'if '毒物' in sjzynr [0]:WTJDXX['SJZY'] = '法医毒物'if '塑料' in sjzynr[0] or '橡胶' in sjzynr[0] or '玻璃' in sjzynr[0]:WTJDXX['SJZY'] = '微量物证'# else:#     WTJDXX['SJZY'] = ''except Exception as e:if is_debug:print('获取涉及专业异常:' + e.__str__())WTJDXX['SJZY'] = ''#'ZTTJBS': '',# 专题统计标识
# def _get_zttjbs(wtjdxx):
#     pass#'AQZY': '',# 案情摘要
#一、案情摘要|摘要案情
#李风宝,男,1968年5月8日出生。因犯抢劫罪被判死刑缓期执行,2009年到山东省青岛监狱服刑,2012年2月27日减为无期徒刑,2014年11月27日减为有期徒刑18年6个月。该犯于2017年10月10日21时20分突然出现心跳呼吸骤停,经监狱医院抢救,病情不稳,于当日22时06分许由120急救车转至青岛市城阳区人民医院抢救治疗。当日22时30分,因抢救无效,宣布临床死亡。
def _get_aqzy(WTJDXX,wtjdxx):try:wtjdxx = wtjdxx.replace(' ', '').replace('  ', '').replace('   ', '')WTJDXX['AQZY'] =''.join(re.findall('(?<=案情摘要).*?(?=文本摘要)',wtjdxx,re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=文本摘要)', wtjdxx, re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?资料摘要)', wtjdxx, re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?资料摘要)', wtjdxx, re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?鉴定过程)', wtjdxx, re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?鉴定过程)', wtjdxx, re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?检验)',wtjdxx,re.DOTALL))if not WTJDXX['AQZY']:WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?检验)',wtjdxx,re.DOTALL))except Exception as e:if is_debug:print('获取案情摘要异常:' + e.__str__())WTJDXX['AQZY'] = ''#'WTYQ': ''# 委托要求
# 四、委托要求: 查明犯罪嫌疑人王中在2013年01月至2016年04月任峄城区榴园镇卫生院院长期间基本公共卫生服务项目经费支出中发放兼职人员没有参加兼职工作的工资补助数额。
def _get_wtyq(WTJDXX,wtjdxx):try:wtjdxx=wtjdxx.replace(' ','').replace('  ','').replace('   ','')WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验开始日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验开始日期)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = 
''.join(re.findall('(?<=鉴定要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验开始日期)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验)', wtjdxx, re.DOTALL))if not WTJDXX['WTYQ']:WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))except Exception as e:if is_debug:print('获取委托要求异常:' + e.__str__())WTJDXX['WTYQ'] = ''
#*************************委托鉴定信息END************************************#*************************鉴定方法与条件START********************************
# 'SYSBJS': '',#使用设备技术
# eg.(一)检验设备
# 美亚柏科便携式手机取证一体机(设备编号:C-02);电子检材屏蔽箱C-03;手机大师一体机版取证软件C-05(版本号:V2.6.22301RTM)
def _get_sysbjs(JDFF_TJ,jdff_tj):try:jdff_tj = jdff_tj.replace(' ', '').replace('  ', '').replace('   ', '')JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=1、检验设备).*?(?=2、检验方法)',jdff_tj,re.DOTALL))if not JDFF_TJ['SYSBJS']:JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=(一)检验设备).*?(?=(二)检验方法)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYSBJS']:JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=检验设备:).*?(?=检验软件)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYSBJS']:JDFF_TJ['SYSBJS'] =''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、?检验结果)',jdff_tj,re.DOTALL))except Exception as e:if is_debug:print('获取使用设备技术异常:' + e.__str__())JDFF_TJ['SYSBJS']=''# 'SYDBZGF': '',# 使用的标准规范 检验方法:《法庭科学电子物证手机检验技术规范》GA/T1069-2013;《电子物证数据恢复检验规程》GBT29360-2012;《电子物证文件一致性检验规程》GBT29361-2012。
def _get_sydbzgf(JDFF_TJ,jdff_tj):try:jdff_tj = jdff_tj.replace(' ', '').replace('  ', '').replace('   ', '')JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[一二三四五六七八九十]?、?检验步骤)',jdff_tj,re.DOTALL))if not JDFF_TJ['SYDBZGF']:JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[123456789]?、?检验步骤)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZGF']:JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[123456789]?、?对送检检材)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZGF']:JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[一二三四五六七八九十]?、?鉴定意见)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZGF']:JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验软件:).*?(?=检验过程)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZGF']:JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、检验结果)', jdff_tj,re.DOTALL))except Exception as e:if is_debug:print('获取使用的标准规范异常:' + e.__str__())JDFF_TJ['SYDBZGF']=''# 'SYDBZPHHC': '',# 使用的标准品和耗材
def _get_sydbzohhc(JDFF_TJ,jdff_tj):try:jdff_tj = jdff_tj.replace(' ', '').replace('  ', '').replace('   ', '')JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验软件).*?(?=[一二三四五六七八九十]?、?检验过程)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZPHHC']:JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验软件).*?(?=[一二三四五六七八九十]?、?检验方法)', jdff_tj, re.DOTALL))if not JDFF_TJ['SYDBZPHHC']:JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、?检验结果)', jdff_tj,re.DOTALL))if not JDFF_TJ['SYDBZPHHC'] :JDFF_TJ['SYDBZPHHC']  = ''.join(re.findall('(?<=检验步骤).*?(?=[一二三四五六七八九十]?、?检验结果)',jdff_tj,re.DOTALL))except Exception as e:if is_debug:print('获取使用的标准品和耗材异常:' + e.__str__())JDFF_TJ['SYDBZPHHC'] =''#*************************鉴定方法与条件END**********************************#***************************鉴定情况START***********************************
# 'KSJDRQ': '',# 开始鉴定日期
def _get_ksjdrq(JDQK,jdqk):try:jdqk=jdqk.replace(' ','').replace('  ','').replace('   ','')JDQK['KSJDRQ'] = ''.join(re.findall('(?<=开始鉴定日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))if not JDQK['KSJDRQ']:JDQK['KSJDRQ']=''.join(re.findall('(?<=开始检验日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))if not JDQK['KSJDRQ']:JDQK['KSJDRQ']=''.join(re.findall('(?<=鉴定开始日期:)'+('\d{4}年\s?\d{1,2}\s?月\d{1,2}\s?日'),jdqk))if not JDQK['KSJDRQ']:JDQK['KSJDRQ']=''.join(re.findall('(?<=检验开始日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))except Exception as e:if is_debug:print("获取开始鉴定日期异常:" + e.__str__())JDQK['KSJDRQ'] = ''# 'BJRQ': '',# 办结日期  最后面的日期2017年6月16日
def _get_bjrq(JDQK,jdqk):try:jdqk = jdqk.replace(' ', '').replace('  ', '').replace('   ', '')JDQK['BJRQ']=''.join(re.findall('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日',jdqk)[-1])except Exception as e:if is_debug:print("获取办理日期异常:" + e.__str__())JDQK['BJRQ'] = ''# 'JDSLKRQ': '',# 鉴定书落款日期  最后面的日期2017年6月16日
def _get_jdslkrq(JDQK,jdqk):try:jdqk = jdqk.replace(' ', '').replace('  ', '').replace('   ', '')JDQK['JDSLKRQ']=''.join(re.findall('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日',jdqk)[-1])except Exception as e:if is_debug:print("获取鉴定书落款日期异常:" + e.__str__())JDQK['JDSLKRQ'] = ''# 'WTRCHRQ': '',# 委托人撤回日期 在文书中查找 如果没有则不返回
def _get_wtrchrq(JDQK,jdqk):try:jdqk = jdqk.replace(' ', '').replace('  ', '').replace('   ', '')JDQK['WTRCHRQ']=''.join(re.findall('(?<=委托人撤回日期:)'+('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日'),jdqk))except Exception as e:if is_debug:print('获取委托人撤回日期异常:' + e.__str__())JDQK['WTRCHRQ'] = ''# 'QTYYZZRQ': '',# 其他原因终止日期  在文书中查找 如果没有 则不返回
def _get_qtyyzzrq(JDQK,jdqk):try:jdqk = jdqk.replace(' ', '').replace('  ', '').replace('   ', '')JDQK['WTRCHRQ']=''.join(re.findall('(?<=其他原因终止日期:)'+('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日'),jdqk))except Exception as e:if is_debug:print('获取其他原因终止日期异常:' + e.__str__())JDQK['WTRCHRQ'] = ''# # 'BLJG': '', # 办理结果 在文书中查找关键字 等于(检验结果 检验意见 检验结论 附件 检验人 鉴定人)附件优先
def _get_bljg(JDQK,jdqk):try:JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=附件)', jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=附件)', jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=附件)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=附件)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=附件)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=附件)', jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=检验人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=检验人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=检验人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=检验人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=鉴定人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=检验人)',jdqk, re.DOTALL)if not JDQK['BLJG']:JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=检验人】)',jdqk, re.DOTALL)JDQK['BLJG']=''.join(JDQK['BLJG'])except Exception as e:if is_debug:print('获取办理结果异常:' + e.__str__())JDQK['BLJG']=''# 'DYJDR': '',# 第一鉴定人   1.检验人:郑志宏  李东升 2014年2月28日  2.检验人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日
#3.鉴定人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日   分三种情况 第一鉴定人为第一个人名
def _get_dyjdr(JDQK, jdqk):
    """Store the first appraiser's name in ``JDQK['DYJDR']``.

    ASCII spaces are removed and the signer text is located after
    '鉴定人:' or '检验人:', up to '授权签字人' or a date written with
    digits or Chinese numerals. jieba part-of-speech tagging is then run
    on that text and words tagged 'nr' are collected as names; the first
    one is stored. The value is '' when no name is found or on any error.

    Args:
        JDQK: Result dict, updated in place.
        jdqk: Appraisal-outcome text slice.
    """
    try:
        jdqk = jdqk.replace(' ', '')
        numeric_date = r'\d{4}年\d{1,2}月\d{1,2}日'
        # Same Chinese-numeral date pattern as the second/third appraiser
        # extractors; the original variant here had a stray '[' inside a
        # character class.
        hanzi_date = (
            r'[零0○〇一二三四五六七八九]{4}年'
            r'[零0○〇一二三四五六七八九十]{1,2}月'
            r'[零0○〇一二三四五六七八九十]{1,2}日'
        )
        patterns = (
            r'(?<=鉴定人:).*?(?=授权签字人)',
            r'(?<=检验人:).*?(?=授权签字人)',
            r'(?<=鉴定人:).*?(?=' + numeric_date + r')',
            r'(?<=鉴定人:).*?(?=' + hanzi_date + r')',
            r'(?<=检验人:).*?(?=' + numeric_date + r')',
            r'(?<=检验人:).*?(?=' + hanzi_date + r')',
        )
        JDR_list = []
        for pattern in patterns:
            JDR_list = re.findall(pattern, jdqk, re.DOTALL)
            if JDR_list:
                break
        JDR_str = ''.join(JDR_list)
        JDR = [pair.word for pair in psg.cut(JDR_str) if pair.flag == 'nr']
        JDQK['DYJDR'] = JDR[0] if JDR else ''
        # An earlier, disabled variant split the signer text on newlines
        # instead of relying on part-of-speech tagging.
    except Exception as e:
        if globals().get('is_debug', False):
            print('获取第一鉴定人异常:' + e.__str__())
        JDQK['DYJDR'] = ''


# 'DRJDR': '',  # second appraiser. Layouts: 1. 检验人:郑志宏  李东升 2014年2月28日  2. 检验人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日
# 3. 鉴定人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日 -- three layouts; the second appraiser is the second name
def _get_dejdr(JDQK, jdqk):
    """Store the second appraiser's name in ``JDQK['DRJDR']``.

    ASCII spaces are removed and the signer text is located after
    '鉴定人:' or '检验人:', up to '授权签字人' or a date written with
    digits or Chinese numerals. jieba part-of-speech tagging is then run
    on that text and words tagged 'nr' are collected as names; the second
    one is stored. The value is '' when fewer than two names are found or
    on any error.

    Args:
        JDQK: Result dict, updated in place.
        jdqk: Appraisal-outcome text slice.
    """
    try:
        jdqk = jdqk.replace(' ', '')
        numeric_date = r'\d{4}年\d{1,2}月\d{1,2}日'
        hanzi_date = (
            r'[零0○〇一二三四五六七八九]{4}年'
            r'[零0○〇一二三四五六七八九十]{1,2}月'
            r'[零0○〇一二三四五六七八九十]{1,2}日'
        )
        patterns = (
            r'(?<=鉴定人:).*?(?=授权签字人)',
            r'(?<=检验人:).*?(?=授权签字人)',
            r'(?<=鉴定人:).*?(?=' + numeric_date + r')',
            r'(?<=鉴定人:).*?(?=' + hanzi_date + r')',
            r'(?<=检验人:).*?(?=' + numeric_date + r')',
            r'(?<=检验人:).*?(?=' + hanzi_date + r')',
        )
        JDR_list = []
        for pattern in patterns:
            JDR_list = re.findall(pattern, jdqk, re.DOTALL)
            if JDR_list:
                break
        JDR_str = ''.join(JDR_list)
        JDR = [pair.word for pair in psg.cut(JDR_str) if pair.flag == 'nr']
        JDQK['DRJDR'] = JDR[1] if len(JDR) >= 2 else ''
        # An earlier, disabled variant split the signer text on newlines
        # instead of relying on part-of-speech tagging.
    except Exception as e:
        if globals().get('is_debug', False):
            print('获取第二鉴定人异常:' + e.__str__())
        JDQK['DRJDR'] = ''


# 'DSJDR': '',  # third appraiser. Layouts: 1. 检验人:郑志宏  李东升 2014年2月28日  2. 检验人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日
#3.鉴定人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日   分三种情况 第三鉴定人为第三个人名
def _get_dsjdr(JDQK, jdqk):
    """Store the third appraiser's name in ``JDQK['DSJDR']``.

    ASCII spaces are removed and the signer text is located after
    '鉴定人:' or '检验人:', up to '授权签字人' or a date written with
    digits or Chinese numerals. jieba part-of-speech tagging is then run
    on that text and words tagged 'nr' are collected as names; the third
    one is stored. The value is '' when fewer than three names are found
    or on any error.

    Note from the original author: this works when names are written
    without internal spaces; it does not suit layouts such as '张 三' or
    '司法会计 张三\\n司法会计 李四'.

    Args:
        JDQK: Result dict, updated in place.
        jdqk: Appraisal-outcome text slice.
    """
    try:
        jdqk = jdqk.replace(' ', '')
        numeric_date = r'\d{4}年\d{1,2}月\d{1,2}日'
        hanzi_date = (
            r'[零0○〇一二三四五六七八九]{4}年'
            r'[零0○〇一二三四五六七八九十]{1,2}月'
            r'[零0○〇一二三四五六七八九十]{1,2}日'
        )
        patterns = (
            r'(?<=鉴定人:).*?(?=授权签字人)',
            r'(?<=检验人:).*?(?=授权签字人)',
            r'(?<=鉴定人:).*?(?=' + numeric_date + r')',
            r'(?<=鉴定人:).*?(?=' + hanzi_date + r')',
            r'(?<=检验人:).*?(?=' + numeric_date + r')',
            r'(?<=检验人:).*?(?=' + hanzi_date + r')',
        )
        JDR_list = []
        for pattern in patterns:
            JDR_list = re.findall(pattern, jdqk, re.DOTALL)
            if JDR_list:
                break
        JDR_str = ''.join(JDR_list)
        JDR = [pair.word for pair in psg.cut(JDR_str) if pair.flag == 'nr']
        JDQK['DSJDR'] = JDR[2] if len(JDR) >= 3 else ''
    except Exception as e:
        if globals().get('is_debug', False):
            print('获取第三鉴定人异常:' + e.__str__())
        JDQK['DSJDR'] = ''


# 'QTCYR': '',  # other participants. Layouts: 1. 检验人:郑志宏  李东升 2014年2月28日  2. 检验人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日
#3.鉴定人:苏滨 宋文超 授权签字人:苏滨  2015年9月9日   分三种情况 其他鉴定人为第三个人名后的人名 !!!基本上没有!!!
def _get_qtjdr(JDQK, jdqk):
    """Store any further participants in ``JDQK['QTCYR']``.

    ASCII spaces are removed and the signer text is located after
    '鉴定人:' or '检验人:', up to '授权签字人' or a date written with
    digits or Chinese numerals. jieba part-of-speech tagging is then run
    on that text and words tagged 'nr' are collected as names. When four
    or more names are found, the list of names from the fourth onwards is
    stored; otherwise, and on any error, the value is ''.

    Args:
        JDQK: Result dict, updated in place.
        jdqk: Appraisal-outcome text slice.
    """
    try:
        jdqk = jdqk.replace(' ', '')
        numeric_date = r'\d{4}年\d{1,2}月\d{1,2}日'
        # Some documents write the date with Chinese numerals or in other
        # non-standard forms; this pattern covers those.
        hanzi_date = (
            r'[零0○〇一二三四五六七八九]{4}年'
            r'[零0○〇一二三四五六七八九十]{1,2}月'
            r'[零0○〇一二三四五六七八九十]{1,2}日'
        )
        patterns = (
            r'(?<=鉴定人:).*?(?=授权签字人)',
            r'(?<=检验人:).*?(?=授权签字人)',
            r'(?<=鉴定人:).*?(?=' + numeric_date + r')',
            r'(?<=鉴定人:).*?(?=' + hanzi_date + r')',
            r'(?<=检验人:).*?(?=' + numeric_date + r')',
            r'(?<=检验人:).*?(?=' + hanzi_date + r')',
        )
        JDR_list = []
        for pattern in patterns:
            JDR_list = re.findall(pattern, jdqk, re.DOTALL)
            if JDR_list:
                break
        JDR_str = ''.join(JDR_list)
        JDR = [pair.word for pair in psg.cut(JDR_str) if pair.flag == 'nr']
        JDQK['QTCYR'] = JDR[3:] if len(JDR) >= 4 else ''
    except Exception as e:
        if globals().get('is_debug', False):
            print('获取其他参与人异常:' + e.__str__())
        JDQK['QTCYR'] = ''


# 'JYBGFS', 'JDSFS': '',  # number of examination reports, number of appraisal documents
def _get_jybgfs_jdsfs(JDQK,jdqk):try:#获取检验报告JYBG = re.findall('(?:检验报告).*?(?=委托单位)',jdqk,re.S)if '检验报告' in JYBG[0]:JDQK['JYBGFS'] = 1JDQK['JDSFS'] = 0if '鉴定书' in JYBG[0]:JDQK['JYBGFS'] = 0JDQK['JDSFS'] = 1except Exception as e:if is_debug:print('获取检验报告和鉴定书异常:' + e.__str__())JDQK['JYBGFS']=''JDQK['JDSFS']=''# 'WPZJSL': '',#外聘专家数量
def _get_wpzsl(JDQK,jdqk):try:#获取鉴定书WPZJ = re.findall('(?=外聘专家数量)',jdqk)if WPZJ != []:passelse:JDQK['WPZJSL'] = 0except Exception as e:if is_debug:print('获取外聘专家数量异常:' + e.__str__())JDQK['WPZJSL']=''
#***************************鉴定情况END**************************************# 读取文件解析路径
def readoc(path, client=None):
    """Read a document file and return its text.

    Args:
        path: File path; the suffix selects the reader.
        client: The ``win32com.client`` module, needed for the formats
            opened through Word ('.doc', '.htm', '.rtf', '.wps').

    Returns:
        The document text; '未识别格式' for an unsupported suffix;
        'word读取出错' when Word fails to read the file.

    Raises:
        UnicodeDecodeError: a '.txt' file decodes with none of the
            encodings tried.
    """
    suffix = os.path.splitext(path)[-1]
    wbxx = ''
    if suffix == '.txt':
        # Try the common encodings in order; re-raise only if the last fails.
        encodings = ('utf-8', 'gb2312', 'gbk', 'gb18030')
        for index, encoding in enumerate(encodings):
            try:
                with open(path, 'r', encoding=encoding) as f:
                    wbxx = f.read()
                break
            except UnicodeDecodeError:
                if index == len(encodings) - 1:
                    raise
    elif suffix == '.docx':
        document = docx.Document(path)
        wbxx = ''.join(paragraph.text + '\n' for paragraph in document.paragraphs)
    elif suffix in ['.doc', '.docx', '.htm', '.rtf', '.wps']:
        word = client.Dispatch('Word.Application')
        word.visible = 0
        word.displayalerts = 0
        doc = None
        try:
            doc = word.Documents.Open(path)
            parts = [
                paragraph.Range.Text.replace('\r', '\n')
                for paragraph in doc.Paragraphs
            ]
            wbxx = ''.join(parts)
            doc.Close()
        except Exception:
            if doc is not None:
                try:
                    doc.Close()
                except Exception:
                    pass
            return 'word读取出错'
        finally:
            # Always release the Word instance; the original leaked it on
            # the error path.
            word.Quit()
    else:
        return '未识别格式'
    return wbxx


# Entry point for reading files
def read_sys(func):
    """Command-line driver: read document(s) and print ``func(text)``.

    With exactly one command-line argument, that argument is the file to
    read; otherwise a built-in sample path is used. Sets the module-level
    ``is_debug`` switch (False for production, True when debugging).

    Args:
        func: Callable taking the document text and returning the result
            to print (``main`` when run as a script).
    """
    global is_debug
    is_debug = False
    # Whether f_path is a directory whose files should all be processed.
    isfiledir = False
    if len(sys.argv) == 2:
        f_path = sys.argv[-1]
    else:
        # Default single document.
        f_path = r'D:\wb文案测试\检察技术NLP\技术检察检验报告\胶检技鉴受[2019]37028100002号_检验报告_952B014000A60038E053C0A8014875AE.doc'
        if is_debug:
            print(f_path)
        isfiledir = False

    def _process(path):
        """Read one file and print the extraction result for it."""
        cleaned = r'' + path.replace('\"', '')
        if not os.path.isfile(cleaned):
            # The original fell through and called func() with an
            # unassigned variable here; report and skip instead.
            print('未识别文件格式!!!')
            return
        if sys.platform != 'linux' and os.path.splitext(path)[-1] != '.txt':
            from win32com import client
        else:
            client = None
        content = readoc(os.path.normpath(cleaned), client)
        extract_result = func(content)
        print(extract_result)

    if isfiledir:
        # Number of entries handled so far.
        i = 0
        for f in os.scandir(f_path):
            # Debug only: skip the first entry.
            if is_debug and i < 1:
                i = i + 1
                continue
            if is_debug:
                print(f.path)
            _process(f.path)
            i = i + 1
            # Debug only: stop once the counter reaches 200.
            if is_debug and i == 200:
                break
    else:
        _process(f_path)


# Script entry point
if __name__ == '__main__':
    read_sys(main)
*斜体样式*
在获取姓名的时候用到jieba分词和词性标注进行获取
多文本处理还涉及很多内容,这里无法一一说明,具体可以参考代码。欢迎各位大神指导,本人会随时更新和修改。

更多推荐

python中正则表达式与jieba分词的使用

本文发布于:2023-07-03 03:12:23,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1000425.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:分词   正则表达式   python   jieba

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!