001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.lucene.demo; 018 019import java.io.BufferedReader; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.InputStreamReader; 023import java.nio.charset.StandardCharsets; 024import java.nio.file.FileVisitResult; 025import java.nio.file.Files; 026import java.nio.file.Path; 027import java.nio.file.Paths; 028import java.nio.file.SimpleFileVisitor; 029import java.nio.file.attribute.BasicFileAttributes; 030import java.util.Date; 031import org.apache.lucene.analysis.Analyzer; 032import org.apache.lucene.analysis.standard.StandardAnalyzer; 033import org.apache.lucene.demo.knn.DemoEmbeddings; 034import org.apache.lucene.demo.knn.KnnVectorDict; 035import org.apache.lucene.document.Document; 036import org.apache.lucene.document.Field; 037import org.apache.lucene.document.KnnFloatVectorField; 038import org.apache.lucene.document.LongField; 039import org.apache.lucene.document.StringField; 040import org.apache.lucene.document.TextField; 041import org.apache.lucene.index.DirectoryReader; 042import org.apache.lucene.index.IndexReader; 043import org.apache.lucene.index.IndexWriter; 044import org.apache.lucene.index.IndexWriterConfig; 045import org.apache.lucene.index.IndexWriterConfig.OpenMode; 046import org.apache.lucene.index.Term; 047import org.apache.lucene.index.VectorSimilarityFunction; 048import org.apache.lucene.store.Directory; 049import org.apache.lucene.store.FSDirectory; 050import org.apache.lucene.util.IOUtils; 051 052/** 053 * Index all text files under a directory. 054 * 055 * <p>This is a command-line application demonstrating simple Lucene indexing. Run it with no 056 * command-line arguments for usage information. 057 */ 058public class IndexFiles implements AutoCloseable { 059 static final String KNN_DICT = "knn-dict"; 060 061 // Calculates embedding vectors for KnnVector search 062 private final DemoEmbeddings demoEmbeddings; 063 private final KnnVectorDict vectorDict; 064 065 private IndexFiles(KnnVectorDict vectorDict) throws IOException { 066 if (vectorDict != null) { 067 this.vectorDict = vectorDict; 068 demoEmbeddings = new DemoEmbeddings(vectorDict); 069 } else { 070 this.vectorDict = null; 071 demoEmbeddings = null; 072 } 073 } 074 075 /** Index all text files under a directory. */ 076 public static void main(String[] args) throws Exception { 077 String usage = 078 "java org.apache.lucene.demo.IndexFiles" 079 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n" 080 + "This indexes the documents in DOCS_PATH, creating a Lucene index" 081 + "in INDEX_PATH that can be searched with SearchFiles\n" 082 + "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search"; 083 String indexPath = "index"; 084 String docsPath = null; 085 String vectorDictSource = null; 086 boolean create = true; 087 for (int i = 0; i < args.length; i++) { 088 switch (args[i]) { 089 case "-index": 090 indexPath = args[++i]; 091 break; 092 case "-docs": 093 docsPath = args[++i]; 094 break; 095 case "-knn_dict": 096 vectorDictSource = args[++i]; 097 break; 098 case "-update": 099 create = false; 100 break; 101 case "-create": 102 create = true; 103 break; 104 default: 105 throw new IllegalArgumentException("unknown parameter " + args[i]); 106 } 107 } 108 109 if (docsPath == null) { 110 System.err.println("Usage: " + usage); 111 System.exit(1); 112 } 113 114 final Path docDir = Paths.get(docsPath); 115 if (!Files.isReadable(docDir)) { 116 System.out.println( 117 "Document directory '" 118 + docDir.toAbsolutePath() 119 + "' does not exist or is not readable, please check the path"); 120 System.exit(1); 121 } 122 123 Date start = new Date(); 124 try { 125 System.out.println("Indexing to directory '" + indexPath + "'..."); 126 127 Directory dir = FSDirectory.open(Paths.get(indexPath)); 128 Analyzer analyzer = new StandardAnalyzer(); 129 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 130 131 if (create) { 132 // Create a new index in the directory, removing any 133 // previously indexed documents: 134 iwc.setOpenMode(OpenMode.CREATE); 135 } else { 136 // Add new documents to an existing index: 137 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 138 } 139 140 // Optional: for better indexing performance, if you 141 // are indexing many documents, increase the RAM 142 // buffer. But if you do this, increase the max heap 143 // size to the JVM (eg add -Xmx512m or -Xmx1g): 144 // 145 // iwc.setRAMBufferSizeMB(256.0); 146 147 KnnVectorDict vectorDictInstance = null; 148 long vectorDictSize = 0; 149 if (vectorDictSource != null) { 150 KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT); 151 vectorDictInstance = new KnnVectorDict(dir, KNN_DICT); 152 vectorDictSize = vectorDictInstance.ramBytesUsed(); 153 } 154 155 try (IndexWriter writer = new IndexWriter(dir, iwc); 156 IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) { 157 indexFiles.indexDocs(writer, docDir); 158 159 // NOTE: if you want to maximize search performance, 160 // you can optionally call forceMerge here. This can be 161 // a terribly costly operation, so generally it's only 162 // worth it when your index is relatively static (ie 163 // you're done adding documents to it): 164 // 165 // writer.forceMerge(1); 166 } finally { 167 IOUtils.close(vectorDictInstance); 168 } 169 170 Date end = new Date(); 171 try (IndexReader reader = DirectoryReader.open(dir)) { 172 System.out.println( 173 "Indexed " 174 + reader.numDocs() 175 + " documents in " 176 + (end.getTime() - start.getTime()) 177 + " ms"); 178 if (reader.numDocs() > 100 179 && vectorDictSize < 1_000_000 180 && System.getProperty("smoketester") == null) { 181 throw new RuntimeException( 182 "Are you (ab)using the toy vector dictionary? See the package javadocs to understand why you got this exception."); 183 } 184 } 185 } catch (IOException e) { 186 System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); 187 } 188 } 189 190 /** 191 * Indexes the given file using the given writer, or if a directory is given, recurses over files 192 * and directories found under the given directory. 193 * 194 * <p>NOTE: This method indexes one document per input file. This is slow. For good throughput, 195 * put multiple documents into your input file(s). An example of this is in the benchmark module, 196 * which can create "line doc" files, one document per line, using the <a 197 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 198 * >WriteLineDocTask</a>. 199 * 200 * @param writer Writer to the index where the given file/dir info will be stored 201 * @param path The file to index, or the directory to recurse into to find files to index 202 * @throws IOException If there is a low-level I/O error 203 */ 204 void indexDocs(final IndexWriter writer, Path path) throws IOException { 205 if (Files.isDirectory(path)) { 206 Files.walkFileTree( 207 path, 208 new SimpleFileVisitor<>() { 209 @Override 210 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { 211 try { 212 indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); 213 } catch ( 214 @SuppressWarnings("unused") 215 IOException ignore) { 216 ignore.printStackTrace(System.err); 217 // don't index files that can't be read. 218 } 219 return FileVisitResult.CONTINUE; 220 } 221 }); 222 } else { 223 indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); 224 } 225 } 226 227 /** Indexes a single document */ 228 void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { 229 try (InputStream stream = Files.newInputStream(file)) { 230 // make a new, empty document 231 Document doc = new Document(); 232 233 // Add the path of the file as a field named "path". Use a 234 // field that is indexed (i.e. searchable), but don't tokenize 235 // the field into separate words and don't index term frequency 236 // or positional information: 237 Field pathField = new StringField("path", file.toString(), Field.Store.YES); 238 doc.add(pathField); 239 240 // Add the last modified date of the file a field named "modified". 241 // Use a LongField that is indexed with points and doc values, and is efficient 242 // for both filtering (LongField#newRangeQuery) and sorting 243 // (LongField#newSortField). This indexes to milli-second resolution, which 244 // is often too fine. You could instead create a number based on 245 // year/month/day/hour/minutes/seconds, down the resolution you require. 246 // For example the long value 2011021714 would mean 247 // February 17, 2011, 2-3 PM. 248 doc.add(new LongField("modified", lastModified)); 249 250 // Add the contents of the file to a field named "contents". Specify a Reader, 251 // so that the text of the file is tokenized and indexed, but not stored. 252 // Note that FileReader expects the file to be in UTF-8 encoding. 253 // If that's not the case searching for special characters will fail. 254 doc.add( 255 new TextField( 256 "contents", 257 new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); 258 259 if (demoEmbeddings != null) { 260 try (InputStream in = Files.newInputStream(file)) { 261 float[] vector = 262 demoEmbeddings.computeEmbedding( 263 new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))); 264 doc.add( 265 new KnnFloatVectorField( 266 "contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT)); 267 } 268 } 269 270 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 271 // New index, so we just add the document (no old document can be there): 272 System.out.println("adding " + file); 273 writer.addDocument(doc); 274 } else { 275 // Existing index (an old copy of this document may have been indexed) so 276 // we use updateDocument instead to replace the old one matching the exact 277 // path, if present: 278 System.out.println("updating " + file); 279 writer.updateDocument(new Term("path", file.toString()), doc); 280 } 281 } 282 } 283 284 @Override 285 public void close() throws IOException { 286 IOUtils.close(vectorDict); 287 } 288}