##本程序可以遍历本地制定目录,并且将其文本内容上传的SQLITE3的数据库
##数据库字段包含ID,FILENAME,DT, TP, PATH, TM, SIZE, CONTENT
## ID,文件名,文件修改时间,文件后缀名,文件路径,花费时间,提取的字符数,文本内容
##
##DB_FILE是数据库文件名,表名建议 FILES
##ON_DELETED函数是解决另存文件为先删除、再创建的问题
##ON_MODIFIED、ON_CREATED是对应事件时处理文件
##
##本代码可以支持处理PPTX、XLSX文件,如有需要可以继续增加 MSG文件
##
##SQLITE3的 TEXT类型是没有长度限制的。
import sys, os, time, datetime, traceback, ipdb
import sqlite3
from multiprocessing import *
from pptx import Presentation
import openpyxl
file_cnt=0
cnt = 0
DB_FILE = r'D:\FILE_007.DB'
chk_Dir=r"e:\\"
start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
file_no=['START','START','\n']
def file_handle(lock1,lock2,lock3,lock4,lock5,
file_q1,file_q2,file_q3,file_q4,file_q5,
lk1,lk2,lk3,lk4,lk5,
content_q1,content_q2,content_q3,content_q4,content_q5):
pro = current_process().name
p_num = pro[-1]
exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')
sys.stdout.flush()
print(pro, '............', f1)
while f1 != 'NA':
#print(f1,ext)
content = ''
file = os.path.basename(f1)
ext = os.path.splitext(file)[-1]
ext = ext.upper()
if file.find('~$') == -1 and ext == '.XLSX':
DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))
#with lock2: new_q.put(f1)
tm1=time.time()
tp='XLSX'
#file_cnt+=1
print(pro, "...发现修改文件:", f1)
# 解析xlxs文件内容
# 打开Excel文件
try:
workbook = openpyxl.load_workbook(r'{}'.format(f1))
except:
time.sleep(3)
try:
workbook = openpyxl.load_workbook(r'{}'.format(f1))
except:
sys.stdout.flush()
print(Exception)
err=traceback.format_exc()
#if file_1 > '': return
exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')
continue
# 获取所有非空白的工作表名
sheet_names = [name for name in workbook.sheetnames
if workbook[name].max_column > 0 and workbook[name].max_row > 0]
# 遍历每个工作表
for sheet_name in sheet_names:
# 获取工作表对象
sheet = workbook[sheet_name]
# 获取最大行数和列数
max_row = sheet.max_row
max_column = sheet.max_column
# 只取前100行的内容
if max_row > 30:
max_row = 30
if max_column > 20:
max_column = 20
# 遍历每行和每列,获取单元格内容
content += ' SN:{}:'.format(sheet_name)
for row in range(1, max_row+1):
for column in range(1, max_column+1):
# 获取单元格内容
value=sheet.cell(row=row, column=column).value
if value is None:
continue
if str(value).find('=')>-1:
continue
if str(value).isnumeric():
continue
content = content + ',' + str(value)
# 插入记录到数据库
#print(content)
workbook.close()
tm2=time.time()
tm3=str(round(tm2-tm1,3))
exec(f'with lk{p_num}: content_q{p_num}.put((DT_TM,tp,file, f1,tm3, content, pro)')
#if file_1 > '': return
exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')
# 解析 PPTX 文件内容######################################################
if file.find('~$') == -1 and ext == '.PPTX':
DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))
#with lock2: new_q.put(f1)
tm1=time.time()
tp='PPTX'
sys.stdout.flush()
print(pro, "...发现修改文件:", f1)
# 解析pptx文件内容
try:
prs = Presentation(f1)
except:
time.sleep(3)
try:
prs = Presentation(f1)
except:
print(Exception)
err=traceback.format_exc()
#if file_1 > '': return
exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')
continue
content = '\n'.join(paragraph.text for slide in prs.slides for shape in slide.shapes if shape.has_text_frame for paragraph in shape.text_frame.paragraphs)
# 插入记录到数据库
#print(content)
tm2=time.time()
tm3=str(round(tm2-tm1,3))
exec(f'with lk{p_num}: content_q{p_num}.put((DT_TM,tp,file, f1,tm3, content, pro)')
#if file_1 > '': return
exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')
return
if __name__ == '__main__':
tm00 = time.time()
temp_q = []
for i in range(1,6):
exec(f'file_q{i} = Queue()')
exec(f'lock{i} = Semaphore(7)')
exec(f'content_q{i} = Queue()')
exec(f'lk{i} = Semaphore(7)')
exec(f'file_q{i} = Queue()')
for i in range(1,2):
exec(f'p{i} = Process(target=file_handle, \
args=(lock1,lock2,lock3,lock4,lock5, \
file_q1,file_q2,file_q3,file_q4,file_q5, \
lk1,lk2,lk3,lk4,lk5,content_q1, \
content_q2,content_q3,content_q4,content_q5))')
exec(f'p{i}.start()')
# 定义数据库表结构和连接
conn = sqlite3.connect(DB_FILE,check_same_thread=False)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS FILES (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filename TEXT,
dt TEXT,
TP TEXT,
path TEXT,
tm TEXT,
SIZE INTEGER,
content TEXT,
PRO TEXT
)
''')
c.execute('select path from files')
back = c.fetchall()
file_hist = set( list(r[0] for r in back) )
for root, dirs, files in os.walk(chk_Dir,topdown=False):
if root.upper().find("Welink_Files")>-1:
continue
for file in files:
f1 = os.path.join(root, file)
#f1 = f1.replace('\\._','\\')
ext = os.path.splitext(file)[-1]
ext = ext.upper()
if f1 in file_hist:
print('Exist............')
continue
try: size=int(os.path.getsize(f1)/1000/1000)
except: size = 100
if size > 3 and file.find('~$') == -1 and ext in ( '.XLSX'):
file_no.append(f1+'\n文件过大 '+str(size)+' M,跳过......\n')
continue
# 判断文件类型,总流程等 ######################################################
if file.find('~$') == -1 and ext in ( '.XLSX', '.PPTX'):
# 将中转列表写满 10 个 #####################################################
if len(temp_q) < 100: temp_q.append(f1)
if len(temp_q) == 100:
for n in range(1,6):
exec(f'kkk = lock{n}')
with kkk:
for i in range(20):
exec(f'file_q{n}.put(temp_q.pop())')
while file_q1.qsize() > 10:
time.sleep(1)
print(len(active_children()), file_q1.qsize(), len(temp_q), content_q1.qsize(), active_children())
## if content_q.qsize() > 15:
## with lock3:
## for i in range(content_q.qsize()):
## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())
## print('*******************************')
## conn.commit()
print(file_q1.qsize(), len(temp_q), content_q1.qsize())
with lock1:
for i in range(len(temp_q)):
file_q1.put(temp_q.get())
for i in range(10):
file_q1.put('NA')
while len(active_children()) > 0 and file_q1.qsize() > 3:
time.sleep(0.2)
print(len(active_children()), content_q1.qsize(), file_q1.qsize())
## with lock3:
## for i in range(content_q.qsize()):
## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())
## print('*******************************')
## conn.commit()
print(file_q.qsize(), temp_q.qsize(), content_q.qsize())
## while content_q.empty() is False:
## for i in range(content_q.qsize()):
## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content) VALUES (? ,?,?, ?,?, ?)', content_q.get())
## conn.commit()
## end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
## with open(r'd:\res01.txt', 'a', encoding='utf-8') as f:
## f.writelines(file_no)
## f.writelines('\n'+start_time+'\n')
## f.writelines(end_time+'\n')
conn.close()
## print(file_hist - file_now)
print('end.....................................')
print(time.time()-tm00)
|