001package org.apache.lucene.demo; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one or more 005 * contributor license agreements. See the NOTICE file distributed with 006 * this work for additional information regarding copyright ownership. 007 * The ASF licenses this file to You under the Apache License, Version 2.0 008 * (the "License"); you may not use this file except in compliance with 009 * the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019 020import org.apache.lucene.analysis.Analyzer; 021import org.apache.lucene.analysis.standard.StandardAnalyzer; 022import org.apache.lucene.document.Document; 023import org.apache.lucene.document.Field; 024import org.apache.lucene.document.LongField; 025import org.apache.lucene.document.StringField; 026import org.apache.lucene.document.TextField; 027import org.apache.lucene.index.IndexWriter; 028import org.apache.lucene.index.IndexWriterConfig.OpenMode; 029import org.apache.lucene.index.IndexWriterConfig; 030import org.apache.lucene.index.Term; 031import org.apache.lucene.store.Directory; 032import org.apache.lucene.store.FSDirectory; 033import org.apache.lucene.util.Version; 034 035import java.io.BufferedReader; 036import java.io.File; 037import java.io.FileInputStream; 038import java.io.FileNotFoundException; 039import java.io.IOException; 040import java.io.InputStreamReader; 041import java.nio.charset.StandardCharsets; 042import java.util.Date; 043 044/** Index all text files under a directory. 045 * <p> 046 * This is a command-line application demonstrating simple Lucene indexing. 047 * Run it with no command-line arguments for usage information. 048 */ 049public class IndexFiles { 050 051 private IndexFiles() {} 052 053 /** Index all text files under a directory. */ 054 public static void main(String[] args) { 055 String usage = "java org.apache.lucene.demo.IndexFiles" 056 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" 057 + "This indexes the documents in DOCS_PATH, creating a Lucene index" 058 + "in INDEX_PATH that can be searched with SearchFiles"; 059 String indexPath = "index"; 060 String docsPath = null; 061 boolean create = true; 062 for(int i=0;i<args.length;i++) { 063 if ("-index".equals(args[i])) { 064 indexPath = args[i+1]; 065 i++; 066 } else if ("-docs".equals(args[i])) { 067 docsPath = args[i+1]; 068 i++; 069 } else if ("-update".equals(args[i])) { 070 create = false; 071 } 072 } 073 074 if (docsPath == null) { 075 System.err.println("Usage: " + usage); 076 System.exit(1); 077 } 078 079 final File docDir = new File(docsPath); 080 if (!docDir.exists() || !docDir.canRead()) { 081 System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path"); 082 System.exit(1); 083 } 084 085 Date start = new Date(); 086 try { 087 System.out.println("Indexing to directory '" + indexPath + "'..."); 088 089 Directory dir = FSDirectory.open(new File(indexPath)); 090 // :Post-Release-Update-Version.LUCENE_XY: 091 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_0); 092 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer); 093 094 if (create) { 095 // Create a new index in the directory, removing any 096 // previously indexed documents: 097 iwc.setOpenMode(OpenMode.CREATE); 098 } else { 099 // Add new documents to an existing index: 100 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 101 } 102 103 // Optional: for better indexing performance, if you 104 // are indexing many documents, increase the RAM 105 // buffer. But if you do this, increase the max heap 106 // size to the JVM (eg add -Xmx512m or -Xmx1g): 107 // 108 // iwc.setRAMBufferSizeMB(256.0); 109 110 IndexWriter writer = new IndexWriter(dir, iwc); 111 indexDocs(writer, docDir); 112 113 // NOTE: if you want to maximize search performance, 114 // you can optionally call forceMerge here. This can be 115 // a terribly costly operation, so generally it's only 116 // worth it when your index is relatively static (ie 117 // you're done adding documents to it): 118 // 119 // writer.forceMerge(1); 120 121 writer.close(); 122 123 Date end = new Date(); 124 System.out.println(end.getTime() - start.getTime() + " total milliseconds"); 125 126 } catch (IOException e) { 127 System.out.println(" caught a " + e.getClass() + 128 "\n with message: " + e.getMessage()); 129 } 130 } 131 132 /** 133 * Indexes the given file using the given writer, or if a directory is given, 134 * recurses over files and directories found under the given directory. 135 * 136 * NOTE: This method indexes one document per input file. This is slow. For good 137 * throughput, put multiple documents into your input file(s). An example of this is 138 * in the benchmark module, which can create "line doc" files, one document per line, 139 * using the 140 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 141 * >WriteLineDocTask</a>. 142 * 143 * @param writer Writer to the index where the given file/dir info will be stored 144 * @param file The file to index, or the directory to recurse into to find files to index 145 * @throws IOException If there is a low-level I/O error 146 */ 147 static void indexDocs(IndexWriter writer, File file) 148 throws IOException { 149 // do not try to index files that cannot be read 150 if (file.canRead()) { 151 if (file.isDirectory()) { 152 String[] files = file.list(); 153 // an IO error could occur 154 if (files != null) { 155 for (int i = 0; i < files.length; i++) { 156 indexDocs(writer, new File(file, files[i])); 157 } 158 } 159 } else { 160 161 FileInputStream fis; 162 try { 163 fis = new FileInputStream(file); 164 } catch (FileNotFoundException fnfe) { 165 // at least on windows, some temporary files raise this exception with an "access denied" message 166 // checking if the file can be read doesn't help 167 return; 168 } 169 170 try { 171 172 // make a new, empty document 173 Document doc = new Document(); 174 175 // Add the path of the file as a field named "path". Use a 176 // field that is indexed (i.e. searchable), but don't tokenize 177 // the field into separate words and don't index term frequency 178 // or positional information: 179 Field pathField = new StringField("path", file.getPath(), Field.Store.YES); 180 doc.add(pathField); 181 182 // Add the last modified date of the file a field named "modified". 183 // Use a LongField that is indexed (i.e. efficiently filterable with 184 // NumericRangeFilter). This indexes to milli-second resolution, which 185 // is often too fine. You could instead create a number based on 186 // year/month/day/hour/minutes/seconds, down the resolution you require. 187 // For example the long value 2011021714 would mean 188 // February 17, 2011, 2-3 PM. 189 doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); 190 191 // Add the contents of the file to a field named "contents". Specify a Reader, 192 // so that the text of the file is tokenized and indexed, but not stored. 193 // Note that FileReader expects the file to be in UTF-8 encoding. 194 // If that's not the case searching for special characters will fail. 195 doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)))); 196 197 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 198 // New index, so we just add the document (no old document can be there): 199 System.out.println("adding " + file); 200 writer.addDocument(doc); 201 } else { 202 // Existing index (an old copy of this document may have been indexed) so 203 // we use updateDocument instead to replace the old one matching the exact 204 // path, if present: 205 System.out.println("updating " + file); 206 writer.updateDocument(new Term("path", file.getPath()), doc); 207 } 208 209 } finally { 210 fis.close(); 211 } 212 } 213 } 214 } 215}