001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.lucene.demo; 018 019import java.io.BufferedReader; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.InputStreamReader; 023import java.nio.charset.StandardCharsets; 024import java.nio.file.FileVisitResult; 025import java.nio.file.Files; 026import java.nio.file.Path; 027import java.nio.file.Paths; 028import java.nio.file.SimpleFileVisitor; 029import java.nio.file.attribute.BasicFileAttributes; 030import java.util.Date; 031import org.apache.lucene.analysis.Analyzer; 032import org.apache.lucene.analysis.standard.StandardAnalyzer; 033import org.apache.lucene.demo.knn.DemoEmbeddings; 034import org.apache.lucene.demo.knn.KnnVectorDict; 035import org.apache.lucene.document.Document; 036import org.apache.lucene.document.Field; 037import org.apache.lucene.document.KnnVectorField; 038import org.apache.lucene.document.LongPoint; 039import org.apache.lucene.document.StringField; 040import org.apache.lucene.document.TextField; 041import org.apache.lucene.index.DirectoryReader; 042import org.apache.lucene.index.IndexReader; 043import org.apache.lucene.index.IndexWriter; 044import org.apache.lucene.index.IndexWriterConfig; 045import org.apache.lucene.index.IndexWriterConfig.OpenMode; 046import org.apache.lucene.index.Term; 047import org.apache.lucene.index.VectorSimilarityFunction; 048import org.apache.lucene.store.Directory; 049import org.apache.lucene.store.FSDirectory; 050import org.apache.lucene.util.IOUtils; 051 052/** 053 * Index all text files under a directory. 054 * 055 * <p>This is a command-line application demonstrating simple Lucene indexing. Run it with no 056 * command-line arguments for usage information. 057 */ 058public class IndexFiles implements AutoCloseable { 059 static final String KNN_DICT = "knn-dict"; 060 061 // Calculates embedding vectors for KnnVector search 062 private final DemoEmbeddings demoEmbeddings; 063 private final KnnVectorDict vectorDict; 064 065 private IndexFiles(KnnVectorDict vectorDict) throws IOException { 066 if (vectorDict != null) { 067 this.vectorDict = vectorDict; 068 demoEmbeddings = new DemoEmbeddings(vectorDict); 069 } else { 070 this.vectorDict = null; 071 demoEmbeddings = null; 072 } 073 } 074 075 /** Index all text files under a directory. */ 076 public static void main(String[] args) throws Exception { 077 String usage = 078 "java org.apache.lucene.demo.IndexFiles" 079 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n" 080 + "This indexes the documents in DOCS_PATH, creating a Lucene index" 081 + "in INDEX_PATH that can be searched with SearchFiles\n" 082 + "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search"; 083 String indexPath = "index"; 084 String docsPath = null; 085 String vectorDictSource = null; 086 boolean create = true; 087 for (int i = 0; i < args.length; i++) { 088 switch (args[i]) { 089 case "-index": 090 indexPath = args[++i]; 091 break; 092 case "-docs": 093 docsPath = args[++i]; 094 break; 095 case "-knn_dict": 096 vectorDictSource = args[++i]; 097 break; 098 case "-update": 099 create = false; 100 break; 101 case "-create": 102 create = true; 103 break; 104 default: 105 throw new IllegalArgumentException("unknown parameter " + args[i]); 106 } 107 } 108 109 if (docsPath == null) { 110 System.err.println("Usage: " + usage); 111 System.exit(1); 112 } 113 114 final Path docDir = Paths.get(docsPath); 115 if (!Files.isReadable(docDir)) { 116 System.out.println( 117 "Document directory '" 118 + docDir.toAbsolutePath() 119 + "' does not exist or is not readable, please check the path"); 120 System.exit(1); 121 } 122 123 Date start = new Date(); 124 try { 125 System.out.println("Indexing to directory '" + indexPath + "'..."); 126 127 Directory dir = FSDirectory.open(Paths.get(indexPath)); 128 Analyzer analyzer = new StandardAnalyzer(); 129 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 130 131 if (create) { 132 // Create a new index in the directory, removing any 133 // previously indexed documents: 134 iwc.setOpenMode(OpenMode.CREATE); 135 } else { 136 // Add new documents to an existing index: 137 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 138 } 139 140 // Optional: for better indexing performance, if you 141 // are indexing many documents, increase the RAM 142 // buffer. But if you do this, increase the max heap 143 // size to the JVM (eg add -Xmx512m or -Xmx1g): 144 // 145 // iwc.setRAMBufferSizeMB(256.0); 146 147 KnnVectorDict vectorDictInstance = null; 148 long vectorDictSize = 0; 149 if (vectorDictSource != null) { 150 KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT); 151 vectorDictInstance = new KnnVectorDict(dir, KNN_DICT); 152 vectorDictSize = vectorDictInstance.ramBytesUsed(); 153 } 154 155 try (IndexWriter writer = new IndexWriter(dir, iwc); 156 IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) { 157 indexFiles.indexDocs(writer, docDir); 158 159 // NOTE: if you want to maximize search performance, 160 // you can optionally call forceMerge here. This can be 161 // a terribly costly operation, so generally it's only 162 // worth it when your index is relatively static (ie 163 // you're done adding documents to it): 164 // 165 // writer.forceMerge(1); 166 } finally { 167 IOUtils.close(vectorDictInstance); 168 } 169 170 Date end = new Date(); 171 try (IndexReader reader = DirectoryReader.open(dir)) { 172 System.out.println( 173 "Indexed " 174 + reader.numDocs() 175 + " documents in " 176 + (end.getTime() - start.getTime()) 177 + " milliseconds"); 178 if (reader.numDocs() > 100 179 && vectorDictSize < 1_000_000 180 && System.getProperty("smoketester") == null) { 181 throw new RuntimeException( 182 "Are you (ab)using the toy vector dictionary? See the package javadocs to understand why you got this exception."); 183 } 184 } 185 } catch (IOException e) { 186 System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); 187 } 188 } 189 190 /** 191 * Indexes the given file using the given writer, or if a directory is given, recurses over files 192 * and directories found under the given directory. 193 * 194 * <p>NOTE: This method indexes one document per input file. This is slow. For good throughput, 195 * put multiple documents into your input file(s). An example of this is in the benchmark module, 196 * which can create "line doc" files, one document per line, using the <a 197 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 198 * >WriteLineDocTask</a>. 199 * 200 * @param writer Writer to the index where the given file/dir info will be stored 201 * @param path The file to index, or the directory to recurse into to find files to index 202 * @throws IOException If there is a low-level I/O error 203 */ 204 void indexDocs(final IndexWriter writer, Path path) throws IOException { 205 if (Files.isDirectory(path)) { 206 Files.walkFileTree( 207 path, 208 new SimpleFileVisitor<>() { 209 @Override 210 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { 211 try { 212 indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); 213 } catch ( 214 @SuppressWarnings("unused") 215 IOException ignore) { 216 ignore.printStackTrace(System.err); 217 // don't index files that can't be read. 218 } 219 return FileVisitResult.CONTINUE; 220 } 221 }); 222 } else { 223 indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); 224 } 225 } 226 227 /** Indexes a single document */ 228 void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { 229 try (InputStream stream = Files.newInputStream(file)) { 230 // make a new, empty document 231 Document doc = new Document(); 232 233 // Add the path of the file as a field named "path". Use a 234 // field that is indexed (i.e. searchable), but don't tokenize 235 // the field into separate words and don't index term frequency 236 // or positional information: 237 Field pathField = new StringField("path", file.toString(), Field.Store.YES); 238 doc.add(pathField); 239 240 // Add the last modified date of the file a field named "modified". 241 // Use a LongPoint that is indexed (i.e. efficiently filterable with 242 // PointRangeQuery). This indexes to milli-second resolution, which 243 // is often too fine. You could instead create a number based on 244 // year/month/day/hour/minutes/seconds, down the resolution you require. 245 // For example the long value 2011021714 would mean 246 // February 17, 2011, 2-3 PM. 247 doc.add(new LongPoint("modified", lastModified)); 248 249 // Add the contents of the file to a field named "contents". Specify a Reader, 250 // so that the text of the file is tokenized and indexed, but not stored. 251 // Note that FileReader expects the file to be in UTF-8 encoding. 252 // If that's not the case searching for special characters will fail. 253 doc.add( 254 new TextField( 255 "contents", 256 new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); 257 258 if (demoEmbeddings != null) { 259 try (InputStream in = Files.newInputStream(file)) { 260 float[] vector = 261 demoEmbeddings.computeEmbedding( 262 new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))); 263 doc.add( 264 new KnnVectorField("contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT)); 265 } 266 } 267 268 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 269 // New index, so we just add the document (no old document can be there): 270 System.out.println("adding " + file); 271 writer.addDocument(doc); 272 } else { 273 // Existing index (an old copy of this document may have been indexed) so 274 // we use updateDocument instead to replace the old one matching the exact 275 // path, if present: 276 System.out.println("updating " + file); 277 writer.updateDocument(new Term("path", file.toString()), doc); 278 } 279 } 280 } 281 282 @Override 283 public void close() throws IOException { 284 IOUtils.close(vectorDict); 285 } 286}