test008

yzm284932

##本程序可以遍历本地制定目录，并且将其文本内容上传的SQLITE3的数据库

##数据库字段包含ID,FILENAME,DT, TP, PATH, TM, SIZE, CONTENT

## ID,文件名，文件修改时间，文件后缀名，文件路径，花费时间，提取的字符数，文本内容

##

##DB_FILE是数据库文件名，表名建议 FILES

##ON_DELETED函数是解决另存文件为先删除、再创建的问题

##ON_MODIFIED、ON_CREATED是对应事件时处理文件

##

##本代码可以支持处理PPTX、XLSX文件，如有需要可以继续增加 MSG文件

##

##SQLITE3的 TEXT类型是没有长度限制的。

import sys, os, time, datetime, traceback, ipdb

import sqlite3

from multiprocessing import *

from pptx import Presentation

import openpyxl

file_cnt=0

cnt = 0

DB_FILE = r'D:\FILE_007.DB'

chk_Dir=r"e:\\"

start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

file_no=['START','START','\n']

def file_handle(lock1,lock2,lock3,lock4,lock5,

file_q1,file_q2,file_q3,file_q4,file_q5,

lk1,lk2,lk3,lk4,lk5,

content_q1,content_q2,content_q3,content_q4,content_q5):

pro = current_process().name

p_num = pro[-1]

exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')

sys.stdout.flush()

print(pro, '............', f1)

while f1 != 'NA':

#print(f1,ext)

content = ''

file = os.path.basename(f1)

ext = os.path.splitext(file)[-1]

ext = ext.upper()

if file.find('~$') == -1 and ext == '.XLSX':

DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))

#with lock2: new_q.put(f1)

tm1=time.time()

tp='XLSX'

#file_cnt+=1

print(pro, "...发现修改文件:", f1)

# 解析xlxs文件内容

# 打开Excel文件

try:

workbook = openpyxl.load_workbook(r'{}'.format(f1))

except:

time.sleep(3)

try:

workbook = openpyxl.load_workbook(r'{}'.format(f1))

except:

sys.stdout.flush()

print(Exception)

err=traceback.format_exc()

#if file_1 > '': return

exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')

continue

# 获取所有非空白的工作表名

sheet_names = [name for name in workbook.sheetnames

if workbook[name].max_column > 0 and workbook[name].max_row > 0]

# 遍历每个工作表

for sheet_name in sheet_names:

# 获取工作表对象

sheet = workbook[sheet_name]

# 获取最大行数和列数

max_row = sheet.max_row

max_column = sheet.max_column

# 只取前100行的内容

if max_row > 30:

max_row = 30

if max_column > 20:

max_column = 20

# 遍历每行和每列，获取单元格内容

content += ' SN:{}:'.format(sheet_name)

for row in range(1, max_row+1):

for column in range(1, max_column+1):

# 获取单元格内容

value=sheet.cell(row=row, column=column).value

if value is None:

continue

if str(value).find('=')>-1:

continue

if str(value).isnumeric():

continue

content = content + ',' + str(value)

# 插入记录到数据库

#print(content)

workbook.close()

tm2=time.time()

tm3=str(round(tm2-tm1,3))

exec(f'with lk{p_num}: content_q{p_num}.put((DT_TM,tp,file, f1,tm3, content, pro)')

#if file_1 > '': return

exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')

# 解析 PPTX 文件内容######################################################

if file.find('~$') == -1 and ext == '.PPTX':

DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))

#with lock2: new_q.put(f1)

tm1=time.time()

tp='PPTX'

sys.stdout.flush()

print(pro, "...发现修改文件:", f1)

# 解析pptx文件内容

try:

prs = Presentation(f1)

except:

time.sleep(3)

try:

prs = Presentation(f1)

except:

print(Exception)

err=traceback.format_exc()

#if file_1 > '': return

exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')

continue

content = '\n'.join(paragraph.text for slide in prs.slides for shape in slide.shapes if shape.has_text_frame for paragraph in shape.text_frame.paragraphs)

# 插入记录到数据库

#print(content)

tm2=time.time()

tm3=str(round(tm2-tm1,3))

exec(f'with lk{p_num}: content_q{p_num}.put((DT_TM,tp,file, f1,tm3, content, pro)')

#if file_1 > '': return

exec(f'with lock{p_num}: f1 = file_q{p_num}.get()')

return

if __name__ == '__main__':

tm00 = time.time()

temp_q = []

for i in range(1,6):

exec(f'file_q{i} = Queue()')

exec(f'lock{i} = Semaphore(7)')

exec(f'content_q{i} = Queue()')

exec(f'lk{i} = Semaphore(7)')

exec(f'file_q{i} = Queue()')

for i in range(1,2):

exec(f'p{i} = Process(target=file_handle, \

args=(lock1,lock2,lock3,lock4,lock5, \

file_q1,file_q2,file_q3,file_q4,file_q5, \

lk1,lk2,lk3,lk4,lk5,content_q1, \

content_q2,content_q3,content_q4,content_q5))')

exec(f'p{i}.start()')

# 定义数据库表结构和连接

conn = sqlite3.connect(DB_FILE,check_same_thread=False)

c = conn.cursor()

c.execute('''

CREATE TABLE IF NOT EXISTS FILES (

id INTEGER PRIMARY KEY AUTOINCREMENT,

filename TEXT,

dt TEXT,

TP TEXT,

path TEXT,

tm TEXT,

SIZE INTEGER,

content TEXT,

PRO TEXT

)

''')

c.execute('select path from files')

back = c.fetchall()

file_hist = set( list(r[0] for r in back) )

for root, dirs, files in os.walk(chk_Dir,topdown=False):

if root.upper().find("Welink_Files")>-1:

continue

for file in files:

f1 = os.path.join(root, file)

#f1 = f1.replace('\\._','\\')

ext = os.path.splitext(file)[-1]

ext = ext.upper()

if f1 in file_hist:

print('Exist............')

continue

try: size=int(os.path.getsize(f1)/1000/1000)

except: size = 100

if size > 3 and file.find('~$') == -1 and ext in ( '.XLSX'):

file_no.append(f1+'\n文件过大 '+str(size)+' M,跳过......\n')

continue

# 判断文件类型，总流程等 ######################################################

if file.find('~$') == -1 and ext in ( '.XLSX', '.PPTX'):

# 将中转列表写满 10 个 #####################################################

if len(temp_q) < 100: temp_q.append(f1)

if len(temp_q) == 100:

for n in range(1,6):

exec(f'kkk = lock{n}')

with kkk:

for i in range(20):

exec(f'file_q{n}.put(temp_q.pop())')

while file_q1.qsize() > 10:

time.sleep(1)

print(len(active_children()), file_q1.qsize(), len(temp_q), content_q1.qsize(), active_children())

## if content_q.qsize() > 15:

## with lock3:

## for i in range(content_q.qsize()):

## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())

## print('*******************************')

## conn.commit()

print(file_q1.qsize(), len(temp_q), content_q1.qsize())

with lock1:

for i in range(len(temp_q)):

file_q1.put(temp_q.get())

for i in range(10):

file_q1.put('NA')

while len(active_children()) > 0 and file_q1.qsize() > 3:

time.sleep(0.2)

print(len(active_children()), content_q1.qsize(), file_q1.qsize())

## with lock3:

## for i in range(content_q.qsize()):

## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())

## print('*******************************')

## conn.commit()

print(file_q.qsize(), temp_q.qsize(), content_q.qsize())

## while content_q.empty() is False:

## for i in range(content_q.qsize()):

## c.execute('INSERT INTO files (dt,tp,filename, path, tm,content) VALUES (? ,?,?, ?,?, ?)', content_q.get())

## conn.commit()

## end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

## with open(r'd:\res01.txt', 'a', encoding='utf-8') as f:

## f.writelines(file_no)

## f.writelines('\n'+start_time+'\n')

## f.writelines(end_time+'\n')

conn.close()

## print(file_hist - file_now)

print('end.....................................')

print(time.time()-tm00)

0回帖数	1关注人数	715浏览人数
最后回复于：2023-6-6 20:02

提问

test008