package org.apache.lucene.demo;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

/** Index all text files under a directory.
 * <p>
 * This is a command-line application demonstrating simple Lucene indexing.
 * Run it with no command-line arguments for usage information.
 */
public class IndexFiles {

  private IndexFiles() {}

  /**
   * Index all text files under a directory.
   * <p>
   * Recognized arguments: <code>-index INDEX_PATH</code> (defaults to
   * <code>index</code>), <code>-docs DOCS_PATH</code> (required) and
   * <code>-update</code> (append to / update an existing index instead of
   * creating a new one). The JVM exits with status 1 when <code>-docs</code>
   * is missing, when an option that needs a value is the last argument, or
   * when the document directory does not exist or is not readable.
   *
   * @param args the command-line arguments
   */
  public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
                 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
                 + "This indexes the documents in DOCS_PATH, creating a Lucene index"
                 + " in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        indexPath = optionValue(args, i, usage);
        i++;
      } else if ("-docs".equals(args[i])) {
        docsPath = optionValue(args, i, usage);
        i++;
      } else if ("-update".equals(args[i])) {
        create = false;
      }
    }

    if (docsPath == null) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
      System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    Date start = new Date();
    try {
      System.out.println("Indexing to directory '" + indexPath + "'...");

      Directory dir = FSDirectory.open(new File(indexPath));
      try {
        // :Post-Release-Update-Version.LUCENE_XY:
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);

        if (create) {
          // Create a new index in the directory, removing any
          // previously indexed documents:
          iwc.setOpenMode(OpenMode.CREATE);
        } else {
          // Add new documents to an existing index:
          iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        boolean success = false;
        try {
          indexDocs(writer, docDir);

          // NOTE: if you want to maximize search performance,
          // you can optionally call forceMerge here.  This can be
          // a terribly costly operation, so generally it's only
          // worth it when your index is relatively static (ie
          // you're done adding documents to it):
          //
          // writer.forceMerge(1);

          writer.close();
          success = true;
        } finally {
          if (!success) {
            // Indexing or close failed: release the writer (and its write
            // lock) without committing, so a failed run does not publish a
            // partial index.
            try {
              writer.rollback();
            } catch (IOException ignored) {
              // Deliberately ignored so the original failure is the one reported.
            }
          }
        }
      } finally {
        dir.close();
      }

      Date end = new Date();
      System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());
    }
  }

  /**
   * Returns the value following the option at position <code>i</code>, or
   * prints the usage message and exits with status 1 when the option is the
   * last argument and therefore has no value.
   */
  private static String optionValue(String[] args, int i, String usage) {
    if (i + 1 >= args.length) {
      System.err.println("Missing value for option " + args[i]);
      System.err.println("Usage: " + usage);
      System.exit(1);
    }
    return args[i + 1];
  }

  /**
   * Indexes the given file using the given writer, or if a directory is given,
   * recurses over files and directories found under the given directory.
   * Files and directories that cannot be read are silently skipped.
   *
   * NOTE: This method indexes one document per input file.  This is slow.  For good
   * throughput, put multiple documents into your input file(s).  An example of this is
   * in the benchmark module, which can create "line doc" files, one document per line,
   * using the
   * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
   * >WriteLineDocTask</a>.
   *
   * @param writer Writer to the index where the given file/dir info will be stored
   * @param file The file to index, or the directory to recurse into to find files to index
   * @throws IOException If there is a low-level I/O error
   */
  static void indexDocs(IndexWriter writer, File file)
    throws IOException {
    // do not try to index files that cannot be read
    if (!file.canRead()) {
      return;
    }

    if (file.isDirectory()) {
      String[] files = file.list();
      // an IO error could occur, in which case list() returns null
      if (files != null) {
        for (int i = 0; i < files.length; i++) {
          indexDocs(writer, new File(file, files[i]));
        }
      }
      return;
    }

    indexFile(writer, file);
  }

  /**
   * Indexes a single regular file as one document with the fields "path",
   * "modified" and "contents". Returns without indexing when the file cannot
   * be opened.
   */
  private static void indexFile(IndexWriter writer, File file)
    throws IOException {
    FileInputStream fis;
    try {
      fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
      // at least on windows, some temporary files raise this exception with an "access denied" message
      // checking if the file can be read doesn't help
      return;
    }

    try {

      // make a new, empty document
      Document doc = new Document();

      // Add the path of the file as a field named "path".  Use a
      // field that is indexed (i.e. searchable), but don't tokenize
      // the field into separate words and don't index term frequency
      // or positional information:
      Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
      doc.add(pathField);

      // Add the last modified date of the file as a field named "modified".
      // Use a LongField that is indexed (i.e. efficiently filterable with
      // NumericRangeFilter).  This indexes to milli-second resolution, which
      // is often too fine.  You could instead create a number based on
      // year/month/day/hour/minutes/seconds, down to the resolution you require.
      // For example the long value 2011021714 would mean
      // February 17, 2011, 2-3 PM.
      doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

      // Add the contents of the file to a field named "contents".  Specify a Reader,
      // so that the text of the file is tokenized and indexed, but not stored.
      // Note that the reader below decodes the file as UTF-8.
      // If the file is in a different encoding, searching for special characters will fail.
      doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

      if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        // New index, so we just add the document (no old document can be there):
        System.out.println("adding " + file);
        writer.addDocument(doc);
      } else {
        // Existing index (an old copy of this document may have been indexed) so
        // we use updateDocument instead to replace the old one matching the exact
        // path, if present:
        System.out.println("updating " + file);
        writer.updateDocument(new Term("path", file.getPath()), doc);
      }

    } finally {
      fis.close();
    }
  }
}