TEST09

yzm284932

##本程序可以遍历本地制定目录，并且将其文本内容上传的SQLITE3的数据库

##数据库字段包含ID,FILENAME,DT, TP, PATH, TM, SIZE, CONTENT

## ID,文件名，文件修改时间，文件后缀名，文件路径，花费时间，提取的字符数，文本内容

##

##DB_FILE是数据库文件名，表名建议 FILES

##ON_DELETED函数是解决另存文件为先删除、再创建的问题

##ON_MODIFIED、ON_CREATED是对应事件时处理文件

##

##本代码可以支持处理PPTX、XLSX文件，如有需要可以继续增加 MSG文件

##

##SQLITE3的 TEXT类型是没有长度限制的。

import sys, os, time, datetime, traceback, ipdb, gc

sys.path.append

import sqlite3

from multiprocessing import *

from pptx import Presentation

import openpyxl

file_cnt=0

cnt = 0

DB_FILE = r'D:\FILE_006.DB'

chk_Dir=r"H:\\"

start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

file_no=['START','START','\n']

def file_handle(lock1, file_q, lock2, new_q, lock3, content_q):

pro = current_process().name

sub_cnt = 0

with lock1: f1 = file_q.get()

while f1 != 'NA' and sub_cnt < 500:

#print(f1,ext)

content = ''

file = os.path.basename(f1)

ext = os.path.splitext(file)[-1]

ext = ext.upper()

if file.find('~$') == -1 and ext == '.XLSX':

DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))

#with lock2: new_q.put(f1)

tm1=time.time()

tp='XLSX'

#file_cnt+=1

print(pro,sub_cnt, "...发现修改文件:", f1)

# 解析xlxs文件内容

# 打开Excel文件

try:

workbook = openpyxl.load_workbook(r'{}'.format(f1))

except:

time.sleep(3)

try:

workbook = openpyxl.load_workbook(r'{}'.format(f1))

except:

sys.stdout.flush()

print(Exception)

err=traceback.format_exc()

with lock1: f1 = file_q.get()

continue

# 获取所有非空白的工作表名

sheet_names = [name for name in workbook.sheetnames

if workbook[name].max_column > 0 and workbook[name].max_row > 0]

# 遍历每个工作表

for sheet_name in sheet_names:

# 获取工作表对象

sheet = workbook[sheet_name]

# 获取最大行数和列数

max_row = sheet.max_row

max_column = sheet.max_column

# 只取前100行的内容

if max_row > 30:

max_row = 30

if max_column > 20:

max_column = 20

# 遍历每行和每列，获取单元格内容

content += ' SN:{}:'.format(sheet_name)

for row in range(1, max_row+1):

for column in range(1, max_column+1):

# 获取单元格内容

value=sheet.cell(row=row, column=column).value

if value is None:

continue

if str(value).find('=')>-1:

continue

if str(value).isnumeric():

continue

content = content + ',' + str(value)

# 插入记录到数据库

#print(content)

workbook.close()

tm2=time.time()

tm3=str(round(tm2-tm1,3))

with lock3: content_q.put((DT_TM,tp,file, f1,tm3, content, pro))

#workbook.close()

time.sleep(0.02)

#if file_1 > '': return

with lock1: f1 = file_q.get()

sub_cnt += 1

try: workbook.close()

except: pass

time.sleep(0.02)

# 解析 PPTX 文件内容######################################################

if file.find('~$') == -1 and ext == '.PPTX':

DT_TM= time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime(os.path.getmtime(f1)))

#with lock2: new_q.put(f1)

tm1=time.time()

tp='PPTX'

sys.stdout.flush()

print(pro,sub_cnt, "...发现修改文件:", f1)

# 解析pptx文件内容

try:

prs = Presentation(f1)

except:

time.sleep(3)

try:

prs = Presentation(f1)

except:

print(Exception)

err=traceback.format_exc()

#if file_1 > '': return

with lock1: f1 = file_q.get()

continue

content = '\n'.join(paragraph.text for slide in prs.slides for shape in slide.shapes if shape.has_text_frame for paragraph in shape.text_frame.paragraphs)

# 插入记录到数据库

#print(content)

tm2=time.time()

tm3=str(round(tm2-tm1,3))

with lock3: content_q.put((DT_TM,tp,file, f1,tm3, content, pro))

#prs.close()

time.sleep(0.02)

#if file_1 > '': return

with lock1: f1 = file_q.get()

sub_cnt += 1

## sys.stdout.flush()

## print(current_process().name, '***************', file_q.qsize(), f1)

try: prs.close()

except: pass

time.sleep(0.02)

return

if __name__ == '__main__':

tm00 = time.time()

temp_q = []

ppp = {}

file_q, new_q, content_q = Queue(), Queue(), Queue()

lock1, lock2, lock3 = Semaphore(6), Semaphore(6), Semaphore(6)

for i in range(1,6):

exec(f"p{i} = Process(target=file_handle, args=(lock1, file_q, lock2, new_q, lock3, content_q), name='p{i}')")

exec(f"p{i}.start()")

## p1.start()

## p2.start()

## p3.start()

## p4.start()

# 定义数据库表结构和连接

conn = sqlite3.connect(DB_FILE,check_same_thread=False)

c = conn.cursor()

c.execute('''

CREATE TABLE IF NOT EXISTS FILES (

id INTEGER PRIMARY KEY AUTOINCREMENT,

filename TEXT,

dt TEXT,

TP TEXT,

path TEXT,

tm TEXT,

SIZE INTEGER,

content TEXT,

PRO TEXT

)

''')

c.execute('select path from files')

back = c.fetchall()

file_hist = set( list(r[0] for r in back) )

for root, dirs, files in os.walk(chk_Dir,topdown=False):

if root.upper().find("Welink_Files")>-1:

continue

for file in files:

f1 = os.path.join(root, file)

#f1 = f1.replace('\\._','\\')

ext = os.path.splitext(file)[-1]

ext = ext.upper()

if f1 in file_hist:

print('Exist............')

continue

try: size=int(os.path.getsize(f1)/1000/1000)

except: size = 100

if size > 3 and file.find('~$') == -1 and ext in ( '.XLSX'):

file_no.append(f1+'\n文件过大 '+str(size)+' M,跳过......\n')

continue

# 解析 XLSX 文件内容######################################################

if file.find('~$') == -1 and ext in ( '.XLSX', '.PPTX'):

if len(temp_q) < 30: temp_q.append(f1)

if len(temp_q) == 30:

with lock1:

for i in range(len(temp_q)):

file_q.put(temp_q.pop())

if content_q.qsize() > 50:

gc.collect()

with lock3:

for i in range(content_q.qsize()):

yuan = content_q.get()

ppp[yuan[-1]] = 1 + ppp[yuan[-1]] if yuan[-1] in ppp else 0

c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', yuan)

print('*******************************')

conn.commit()

while file_q.qsize() > 30:

print(ppp)

print(len(active_children()), file_q.qsize(), len(temp_q), content_q.qsize(), '......', lock1.get_value(), lock1.get_value())

time.sleep(1)

if len(active_children()) < 5:

gc.collect()

p = Process(target=file_handle, args=(lock1, file_q, lock2, new_q, lock3, content_q))

p.start()

break

for i in range(len(temp_q)):

file_q.put(temp_q.pop())

for i in range(6):

file_q.put('NA')

while len(active_children()) > 0 and file_q.qsize() > 3:

time.sleep(0.2)

print(len(active_children()), content_q.qsize(), file_q.qsize(), len(temp_q))

with lock3:

for i in range(content_q.qsize()):

c.execute('INSERT INTO files (dt,tp,filename, path, tm,content, pro) VALUES (?,?,?,?,?,?,?)', content_q.get())

print('*******************************')

conn.commit()

print(file_q.qsize(), content_q.qsize(), len(temp_q))

end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

with open(r'd:\res01.txt', 'a', encoding='utf-8') as f:

f.writelines(str(ppp)+'\n\n')

f.writelines('\n'+start_time+'\n')

f.writelines(end_time+'\n')

conn.close()

print('end.....................................')

print(time.time()-tm00)

0回帖数	1关注人数	759浏览人数
最后回复于：2023-6-8 11:38

提问

TEST09