1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
| package com.yhd.test.poi;
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Date;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class LuceneCreateIndex {
public static void main(String[] args) throws IOException { String dataDirectory = "D:\\Studying\\poi\\test\\dataDirectory"; String indexDirectory = "D:\\Studying\\poi\\test\\indexDirectory"; Directory directory = new SimpleFSDirectory(new File(indexDirectory)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
IndexWriter indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); File[] files = new File(dataDirectory).listFiles();
for (int i = 0; i < files.length; i++) { System.out.println("这是第" + i + "个文件----------------"); System.out.println("完整路径:" + files[i].toString()); String fileName = files[i].getName(); String fileType = fileName.substring(fileName.lastIndexOf(".") + 1, fileName.length()).toLowerCase(); System.out.println("文件名称:" + fileName); System.out.println("文件类型:" + fileType);
Document doc = new Document();
InputStream in = new FileInputStream(files[i]); InputStreamReader reader = null;
if (fileType != null && !fileType.equals("")) {
if (fileType.equals("doc")) { WordExtractor wordExtractor = new WordExtractor(in); doc.add(new Field("contents", wordExtractor.getText(), Field.Store.YES, Field.Index.ANALYZED)); wordExtractor.close(); System.out.println("注意:已为文件“" + fileName + "”创建了索引");
} else if (fileType.equals("docx")) { XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor( new XWPFDocument(in)); doc.add(new Field("contents", xwpfWordExtractor.getText(), Field.Store.YES, Field.Index.ANALYZED)); xwpfWordExtractor.close(); System.out.println("注意:已为文件“" + fileName + "”创建了索引");
} else if (fileType.equals("pdf")) { PDFParser parser = new PDFParser(in); parser.parse(); PDDocument pdDocument = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); doc.add(new Field("contents", stripper.getText(pdDocument), Field.Store.NO, Field.Index.ANALYZED)); pdDocument.close(); System.out.println("注意:已为文件“" + fileName + "”创建了索引");
} else if (fileType.equals("txt")) { reader = new InputStreamReader(in); BufferedReader br = new BufferedReader(reader); String txtFile = ""; String line = null;
while ((line = br.readLine()) != null) { txtFile += line; } doc.add(new Field("contents", txtFile, Field.Store.NO, Field.Index.ANALYZED)); System.out.println("注意:已为文件“" + fileName + "”创建了索引");
} else {
System.out.println(); continue;
}
} doc.add(new Field("filename", files[i].getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED)); indexWriter.addDocument(doc); System.out.println(); } System.out.println("numDocs=" + indexWriter.numDocs()); indexWriter.close();
} }
|