備忘錄_20160105(定位) 修改 回首頁

程式 2019-06-29 02:30:18 1561746618 100
python + tika 搜尋 doc, docx 內的文字

python + tika 搜尋 doc, docx 內的文字

請先把 Apache Tika 伺服器執行起來，再執行下面的 Python 程式：

# -*- coding: utf-8 -*-

# Give it a try.

import sys
# NOTE(review): reload() is only a builtin on Python 2, so this script
# presumably targets Python 2. The reload/setdefaultencoding pair looks like
# the usual workaround to make implicit str<->unicode conversions use UTF-8
# instead of ASCII -- confirm before porting to Python 3.
reload(sys)  
sys.setdefaultencoding('utf-8')

import codecs
import os

from os import walk

import tika
# NOTE(review): set before importing tika.parser; presumably this makes
# tika-python act purely as a client of an already-running Tika server
# rather than trying to start one itself -- verify against tika-python docs.
tika.TikaClientOnly = True
from tika import parser

# Scan every Word document under SEARCH_ROOT, extract its text through Tika
# and record the path of each document whose text contains KEYWORD.
#
# Output: RESULT_FILE (UTF-8, one matching path per line, CRLF terminated).
# Progress is printed to stdout; paths are deliberately not printed.
SEARCH_ROOT = u"d:\\"
KEYWORD = u"嘉哲"
RESULT_FILE = "list_of_docs.txt"
SKIPPED_DIR = "$RECYCLE.BIN"
WORD_EXTENSIONS = ('.doc', '.docx')

counter = 0
# The with-block guarantees the result file is flushed and closed even when
# the scan is interrupted by an exception.
with codecs.open(RESULT_FILE, "w", "utf-8") as file2:
    for (dirpath, dirnames, filenames) in walk(SEARCH_ROOT):
        # Prune the recycle bin in place so os.walk never descends into it,
        # instead of visiting it and discarding every file afterwards.
        dirnames[:] = [name for name in dirnames if name != SKIPPED_DIR]

        for item in filenames:
            if os.path.splitext(item)[1].lower() not in WORD_EXTENSIONS:
                continue

            fp = os.path.join(dirpath, item)
            counter += 1
            try:
                file_data = parser.from_file(fp)
            except (IOError, OSError) as err:
                # A locked or unreadable document must not abort the whole
                # scan; report it by counter and move on.
                print(str(counter) + ' failed: ' + type(err).__name__)
                continue

            # .get() because the parsed dict may lack 'content' when Tika
            # returns an empty response; treat that like "no text".
            text = file_data.get('content')
            if text is None:
                print("None")
                continue
            if not text:
                continue

            print(str(counter)+' has data, len='+str(len(text)))
            if KEYWORD in text:
                file2.write(fp + "\r\n")
                print("found!")