##本程序可以遍历本地制定目录,并且将其文本内容上传的SQLITE3的数据库
##数据库字段包含ID,FILENAME,DT, TP, PATH, TM, SIZE, CONTENT
## ID,文件名,文件修改时间,文件后缀名,文件路径,花费时间,提取的字符数,文本内容
##
##DB_FILE是数据库文件名,表名建议 FILES
##ON_DELETED函数是解决另存文件为先删除、再创建的问题
##ON_MODIFIED、ON_CREATED是对应事件时处理文件
##
##本代码可以支持处理PPTX、XLSX文件,如有需要可以继续增加 MSG文件
##
##SQLITE3的 TEXT类型是没有长度限制的。
import sys, os, time, datetime, traceback, ipdb, gc
sys.path.append
import sqlite3
from multiprocessing import *
from pptx import Presentation
import openpyxl
file_cnt=0
cnt = 0
DB_FILE = r'D:\FILE_006.DB'
chk_Dir=r"H:\\"
start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
file_no=['START','START','\n']
def file_handle(lock1, file_q, lock2, new_q, lock3, content_q):
pro = current_process().name
sub_cnt = 0
with lock1: f1 = file_q.get()
while f1 != 'NA' and sub_cnt < 500:
#print(f1,ext)
content = ''
file = os.path.basename(f1)
ext = os.path.splitext(file)[-1]
ext = ext.upper()
if file.find('~$') == -1 and ext == '.XLSX':
DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))
#with lock2: new_q.put(f1)
tm1=time.time()
tp='XLSX'
#file_cnt+=1
print(pro,sub_cnt, "...发现修改文件:", f1)
# 解析xlxs文件内容
# 打开Excel文件
try:
workbook = openpyxl.load_workbook(r'{}'.format(f1))
except:
time.sleep(3)
try:
workbook = openpyxl.load_workbook(r'{}'.format(f1))
except:
sys.stdout.flush()
print(Exception)
err=traceback.format_exc()
with lock1: f1 = file_q.get()
continue
# 获取所有非空白的工作表名
sheet_names = [name for name in workbook.sheetnames
if workbook[name].max_column > 0 and workbook[name].max_row > 0]
# 遍历每个工作表
for sheet_name in sheet_names:
# 获取工作表对象
sheet = workbook[sheet_name]
# 获取最大行数和列数
max_row = sheet.max_row
max_column = sheet.max_column
# 只取前100行的内容
if max_row > 30:
max_row = 30
if max_column > 20:
max_column = 20
# 遍历每行和每列,获取单元格内容
content += ' SN:{}:'.format(sheet_name)
for row in range(1, max_row+1):
for column in range(1, max_column+1):
# 获取单元格内容
value=sheet.cell(row=row, column=column).value
if value is None:
continue
if str(value).find('=')>-1:
continue
if str(value).isnumeric():
continue
content = content + ',' + str(value)
# 插入记录到数据库
#print(content)
workbook.close()
tm2=time.time()
tm3=str(round(tm2-tm1,3))
with lock3: content_q.put((DT_TM,tp,file, f1,tm3, content, pro))
#workbook.close()
time.sleep(0.02)
#if file_1 > '': return
with lock1: f1 = file_q.get()
sub_cnt += 1
try: workbook.close()
except: pass
time.sleep(0.02)
# 解析 PPTX 文件内容######################################################
if file.find('~$') == -1 and ext == '.PPTX':
DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))
#with lock2: new_q.put(f1)
tm1=time.time()
tp='PPTX'
sys.stdout.flush()
print(pro,sub_cnt, "...发现修改文件:", f1)
# 解析pptx文件内容
try:
prs = Presentation(f1)
except:
time.sleep(3)
try:
prs = Presentation(f1)
except:
print(Exception)
err=traceback.format_exc()
#if file_1 > '': return
with lock1: f1 = file_q.get()
continue
content = '\n'.join(paragraph.text for slide in prs.slides for shape in slide.shapes if shape.has_text_frame for paragraph in shape.text_frame.paragraphs)
# 插入记录到数据库
#print(content)
tm2=time.time()
tm3=str(round(tm2-tm1,3))
with lock3: content_q.put((DT_TM,tp,file, f1,tm3, content, pro))
#prs.close()
time.sleep(0.02)
#if file_1 > '': return
with lock1: f1 = file_q.get()
sub_cnt += 1
## sys.stdout.flush()
## print(current_process().name, '***************', file_q.qsize(), f1)
try: prs.close()
except: pass
time.sleep(0.02)
return
if __name__ == '__main__':
tm00 = time.time()
temp_q = []
ppp = {}
file_q, new_q, content_q = Queue(), Queue(), Queue()
lock1, lock2, lock3 = Semaphore(6), Semaphore(6), Semaphore(6)
for i in range(1,6):
exec(f"p{i} = Process(target=file_handle, args=(lock1, file_q, lock2, new_q, lock3, content_q), name='p{i}')")
exec(f"p{i}.start()")
## p1.start()
## p2.start()
## p3.start()
## p4.start()
# 定义数据库表结构和连接
conn = sqlite3.connect(DB_FILE,check_same_thread=False)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS FILES (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filename TEXT,
dt TEXT,
TP TEXT,
path TEXT,
tm TEXT,
SIZE INTEGER,
content TEXT,
PRO TEXT
)
''')
c.execute('select path from files')
back = c.fetchall()
file_hist = set( list(r[0] for r in back) )
for root, dirs, files in os.walk(chk_Dir,topdown=False):
if root.upper().find("Welink_Files")>-1:
continue
for file in files:
f1 = os.path.join(root, file)
#f1 = f1.replace('\\._','\\')
ext = os.path.splitext(file)[-1]
ext = ext.upper()
if f1 in file_hist:
print('Exist............')
continue
try: size=int(os.path.getsize(f1)/1000/1000)
except: size = 100
if size > 3 and file.find('~$') == -1 and ext in ( '.XLSX'):
file_no.append(f1+'\n文件过大 '+str(size)+' M,跳过......\n')
continue
# 解析 XLSX 文件内容######################################################
if file.find('~$') == -1 and ext in ( '.XLSX', '.PPTX'):
if len(temp_q) < 30: temp_q.append(f1)
if len(temp_q) == 30:
with lock1:
for i in range(len(temp_q)):
file_q.put(temp_q.pop())
if content_q.qsize() > 50:
gc.collect()
with lock3:
for i in range(content_q.qsize()):
yuan = content_q.get()
ppp[yuan[-1]] = 1 + ppp[yuan[-1]] if yuan[-1] in ppp else 0
c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', yuan)
print('*******************************')
conn.commit()
while file_q.qsize() > 30:
print(ppp)
print(len(active_children()), file_q.qsize(), len(temp_q), content_q.qsize(), '......', lock1.get_value(), lock1.get_value())
time.sleep(1)
if len(active_children()) < 5:
gc.collect()
p = Process(target=file_handle, args=(lock1, file_q, lock2, new_q, lock3, content_q))
p.start()
break
for i in range(len(temp_q)):
file_q.put(temp_q.pop())
for i in range(6):
file_q.put('NA')
while len(active_children()) > 0 and file_q.qsize() > 3:
time.sleep(0.2)
print(len(active_children()), content_q.qsize(), file_q.qsize(), len(temp_q))
with lock3:
for i in range(content_q.qsize()):
c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())
print('*******************************')
conn.commit()
print(file_q.qsize(), content_q.qsize(), len(temp_q))
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
with open(r'd:\res01.txt', 'a', encoding='utf-8') as f:
f.writelines(str(ppp)+'\n\n')
f.writelines('\n'+start_time+'\n')
f.writelines(end_time+'\n')
conn.close()
print('end.....................................')
print(time.time()-tm00)
|