備忘錄_20160105(定位)
修改
回首頁
程式 2019-06-29 02:30:18 1561746618 100
python + tika 搜尋 doc, docx 內的文字
python + tika 搜尋 doc, docx 內的文字
執行前請先啟動 Apache Tika 伺服器（程式以 TikaClientOnly 模式連線，不會自行啟動伺服器）
# -*- coding: utf-8 -*-
# Search every .doc/.docx file under SEARCH_ROOT for SEARCH_TERM.
#
# Each Word document found is sent to Apache Tika for text extraction; the
# full path of every document whose extracted text contains SEARCH_TERM is
# written to OUTPUT_LIST, one path per line.  Tika is used in client-only
# mode, so the Tika server has to be started separately before running this.
import sys
if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the implicit str <-> unicode encoding.
    # Python 3 has neither a reload() builtin nor sys.setdefaultencoding().
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
import codecs
import os
from os import walk
import tika
# Kept ahead of the parser import, as in the original script.
tika.TikaClientOnly = True
from tika import parser

SEARCH_ROOT = u"d:\\"                    # directory tree to scan
SEARCH_TERM = u"嘉哲"                     # text to look for in each document
OUTPUT_LIST = "list_of_docs.txt"         # receives the matching paths
WORD_EXTENSIONS = ('.doc', '.docx')      # compared case-insensitively
RECYCLE_BIN_MARKER = "\\$RECYCLE.BIN\\"  # path fragment of files to skip

counter = 0  # number of Word documents sent to Tika so far

# "with" guarantees the list file is flushed and closed even when the walk
# or a Tika request raises part-way through the scan.
with codecs.open(OUTPUT_LIST, "w", "utf-8") as out_file:
    for dirpath, _dirnames, filenames in walk(SEARCH_ROOT):
        for item in filenames:
            fp = os.path.join(dirpath, item)

            # Skip anything that lives inside a recycle bin.
            if RECYCLE_BIN_MARKER in fp:
                continue
            # Only Word documents are of interest.
            if os.path.splitext(fp)[1].lower() not in WORD_EXTENSIONS:
                continue

            counter += 1
            file_data = parser.from_file(fp)
            # .get() so that a result without a 'content' entry is handled
            # like an empty document instead of raising KeyError.
            text = file_data.get('content')
            if text is None:
                print("None")
                continue

            if text:
                print(str(counter) + ' has data, len=' + str(len(text)))
            if SEARCH_TERM in text:
                out_file.write(fp + "\r\n")
                print("found!")