001 package org.apache.lucene.demo; 002 003 /* 004 * Licensed to the Apache Software Foundation (ASF) under one or more 005 * contributor license agreements. See the NOTICE file distributed with 006 * this work for additional information regarding copyright ownership. 007 * The ASF licenses this file to You under the Apache License, Version 2.0 008 * (the "License"); you may not use this file except in compliance with 009 * the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019 020 import org.apache.lucene.analysis.Analyzer; 021 import org.apache.lucene.analysis.standard.StandardAnalyzer; 022 import org.apache.lucene.document.Document; 023 import org.apache.lucene.document.Field; 024 import org.apache.lucene.document.LongField; 025 import org.apache.lucene.document.StringField; 026 import org.apache.lucene.document.TextField; 027 import org.apache.lucene.index.IndexWriter; 028 import org.apache.lucene.index.IndexWriterConfig.OpenMode; 029 import org.apache.lucene.index.IndexWriterConfig; 030 import org.apache.lucene.index.Term; 031 import org.apache.lucene.store.Directory; 032 import org.apache.lucene.store.FSDirectory; 033 import org.apache.lucene.util.Version; 034 035 import java.io.BufferedReader; 036 import java.io.File; 037 import java.io.FileInputStream; 038 import java.io.FileNotFoundException; 039 import java.io.IOException; 040 import java.io.InputStreamReader; 041 import java.util.Date; 042 043 /** Index all text files under a directory. 044 * <p> 045 * This is a command-line application demonstrating simple Lucene indexing. 046 * Run it with no command-line arguments for usage information. 047 */ 048 public class IndexFiles { 049 050 private IndexFiles() {} 051 052 /** Index all text files under a directory. */ 053 public static void main(String[] args) { 054 String usage = "java org.apache.lucene.demo.IndexFiles" 055 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" 056 + "This indexes the documents in DOCS_PATH, creating a Lucene index" 057 + "in INDEX_PATH that can be searched with SearchFiles"; 058 String indexPath = "index"; 059 String docsPath = null; 060 boolean create = true; 061 for(int i=0;i<args.length;i++) { 062 if ("-index".equals(args[i])) { 063 indexPath = args[i+1]; 064 i++; 065 } else if ("-docs".equals(args[i])) { 066 docsPath = args[i+1]; 067 i++; 068 } else if ("-update".equals(args[i])) { 069 create = false; 070 } 071 } 072 073 if (docsPath == null) { 074 System.err.println("Usage: " + usage); 075 System.exit(1); 076 } 077 078 final File docDir = new File(docsPath); 079 if (!docDir.exists() || !docDir.canRead()) { 080 System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path"); 081 System.exit(1); 082 } 083 084 Date start = new Date(); 085 try { 086 System.out.println("Indexing to directory '" + indexPath + "'..."); 087 088 Directory dir = FSDirectory.open(new File(indexPath)); 089 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); 090 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); 091 092 if (create) { 093 // Create a new index in the directory, removing any 094 // previously indexed documents: 095 iwc.setOpenMode(OpenMode.CREATE); 096 } else { 097 // Add new documents to an existing index: 098 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 099 } 100 101 // Optional: for better indexing performance, if you 102 // are indexing many documents, increase the RAM 103 // buffer. But if you do this, increase the max heap 104 // size to the JVM (eg add -Xmx512m or -Xmx1g): 105 // 106 // iwc.setRAMBufferSizeMB(256.0); 107 108 IndexWriter writer = new IndexWriter(dir, iwc); 109 indexDocs(writer, docDir); 110 111 // NOTE: if you want to maximize search performance, 112 // you can optionally call forceMerge here. This can be 113 // a terribly costly operation, so generally it's only 114 // worth it when your index is relatively static (ie 115 // you're done adding documents to it): 116 // 117 // writer.forceMerge(1); 118 119 writer.close(); 120 121 Date end = new Date(); 122 System.out.println(end.getTime() - start.getTime() + " total milliseconds"); 123 124 } catch (IOException e) { 125 System.out.println(" caught a " + e.getClass() + 126 "\n with message: " + e.getMessage()); 127 } 128 } 129 130 /** 131 * Indexes the given file using the given writer, or if a directory is given, 132 * recurses over files and directories found under the given directory. 133 * 134 * NOTE: This method indexes one document per input file. This is slow. For good 135 * throughput, put multiple documents into your input file(s). An example of this is 136 * in the benchmark module, which can create "line doc" files, one document per line, 137 * using the 138 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 139 * >WriteLineDocTask</a>. 140 * 141 * @param writer Writer to the index where the given file/dir info will be stored 142 * @param file The file to index, or the directory to recurse into to find files to index 143 * @throws IOException If there is a low-level I/O error 144 */ 145 static void indexDocs(IndexWriter writer, File file) 146 throws IOException { 147 // do not try to index files that cannot be read 148 if (file.canRead()) { 149 if (file.isDirectory()) { 150 String[] files = file.list(); 151 // an IO error could occur 152 if (files != null) { 153 for (int i = 0; i < files.length; i++) { 154 indexDocs(writer, new File(file, files[i])); 155 } 156 } 157 } else { 158 159 FileInputStream fis; 160 try { 161 fis = new FileInputStream(file); 162 } catch (FileNotFoundException fnfe) { 163 // at least on windows, some temporary files raise this exception with an "access denied" message 164 // checking if the file can be read doesn't help 165 return; 166 } 167 168 try { 169 170 // make a new, empty document 171 Document doc = new Document(); 172 173 // Add the path of the file as a field named "path". Use a 174 // field that is indexed (i.e. searchable), but don't tokenize 175 // the field into separate words and don't index term frequency 176 // or positional information: 177 Field pathField = new StringField("path", file.getPath(), Field.Store.YES); 178 doc.add(pathField); 179 180 // Add the last modified date of the file a field named "modified". 181 // Use a LongField that is indexed (i.e. efficiently filterable with 182 // NumericRangeFilter). This indexes to milli-second resolution, which 183 // is often too fine. You could instead create a number based on 184 // year/month/day/hour/minutes/seconds, down the resolution you require. 185 // For example the long value 2011021714 would mean 186 // February 17, 2011, 2-3 PM. 187 doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); 188 189 // Add the contents of the file to a field named "contents". Specify a Reader, 190 // so that the text of the file is tokenized and indexed, but not stored. 191 // Note that FileReader expects the file to be in UTF-8 encoding. 192 // If that's not the case searching for special characters will fail. 193 doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); 194 195 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 196 // New index, so we just add the document (no old document can be there): 197 System.out.println("adding " + file); 198 writer.addDocument(doc); 199 } else { 200 // Existing index (an old copy of this document may have been indexed) so 201 // we use updateDocument instead to replace the old one matching the exact 202 // path, if present: 203 System.out.println("updating " + file); 204 writer.updateDocument(new Term("path", file.getPath()), doc); 205 } 206 207 } finally { 208 fis.close(); 209 } 210 } 211 } 212 } 213 }