package TestLucene; import java.io.File; import java.io.FileReader; import java.io.Reader; import java.util.Date; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; /** * This class demonstrate the process of creating index with Lucene * for text files */ public class TxtFileIndex { public static void main(String[] args) throws Exception{ //设置索引地址 File indexDir = new File("D:\\luceneIndex"); //设置数据地址 File dataDir = new File("D:\\luceneData"); //建立分词 Analyzer luceneAnalyzer = new StandardAnalyzer(); //取得目录下所有Files File[] dataFiles = dataDir.listFiles(); //建立indexWrite indexWrite主要作用是添加索引 IndexWriter indexWriter = new IndexWriter(indexDir,luceneAnalyzer,true); //取得程序开启时间 long startTime = new Date().getTime(); //循环文件 for(int i = 0; i < dataFiles.length; i++){ //取出txt后缀的文档 if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){ System.out.println("Indexing file " + dataFiles[i].getCanonicalPath()); //新建一个Document Document document = new Document(); //读取数据 Reader txtReader = new FileReader(dataFiles[i]); //Document添加path document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES, Field.Index.UN_TOKENIZED)); //Document添加正文 document.add(new Field("contents",txtReader)); //添加索引 indexWriter.addDocument(document); } } indexWriter.optimize(); indexWriter.close(); long endTime = new Date().getTime(); //输出程序所用时间 System.out.println("It takes " + (endTime - startTime) + " milliseconds to create index for the files in directory " + dataDir.getPath()); } }
//pizza package TestLucene; import java.io.File; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.FSDirectory; /** * This class is used to demonstrate the * process of searching on an existing * Lucene index * */ public class TxtFileSearcher { public static void main(String[] args) throws Exception{ Logger logger=Logger.getLogger(TxtFileSearcher.class); //要查询的词组 String queryStr = "com.log4j.test.TestLog.main"; //索引地址 File indexDir = new File("D:\\luceneIndex"); //取得索引字典 FSDirectory directory = FSDirectory.getDirectory(indexDir,false); //建立查询 IndexSearcher searcher = new IndexSearcher(directory); //查询的索引地址是否存在 if(!indexDir.exists()){ System.out.println("The Lucene index is not exist"); return; } //建立term 查询docuemnt中contents中的内容(内容要转为大字) Term term = new Term("contents",queryStr.toLowerCase()); //进行查询 TermQuery luceneQuery = new TermQuery(term); //生成结果 Hits hits = searcher.search(luceneQuery); for(int i = 0; i < hits.length(); i++){ //取得结果中的dowuemnt Document document = hits.doc(i); //取得返回的path属性 System.out.println("File: " + document.get("path")); } } } Lucene 的四大索引查询 清单1:使用布尔操作符
//Test boolean operator public void testOperator(String indexDirectory) throws Exception{ Directory dir = FSDirectory.getDirectory(indexDirectory,false); IndexSearcher indexSearcher = new IndexSearcher(dir); String[] searchWords = {"Java AND Lucene", "Java NOT Lucene", "Java OR Lucene", "+Java +Lucene", "+Java -Lucene"}; Analyzer language = new StandardAnalyzer(); Query query; for(int i = 0; i < searchWords.length; i++){ query = QueryParser.parse(searchWords[i], "title", language); Hits results = indexSearcher.search(query); System.out.println(results.length() + "search results for query " + searchWords[i]); } } 域搜索(Field Search) Lucene 支持域搜索,你可以指定一次查询是在哪些域(Field)上进行。例如,如果索引的文档包含两个域,Title 和 Content,你就可以使用查询 “Title: Lucene AND Content: Java” 来返回所有在 Title 域上包含 Lucene 并且在 Content 域上包含 Java 的文档。清单 2 显示了如何利用 Lucene 的 API 来实现域搜索。 清单2:实现域搜索
//Test field search public void testFieldSearch(String indexDirectory) throws Exception{ Directory dir = FSDirectory.getDirectory(indexDirectory,false); IndexSearcher indexSearcher = new IndexSearcher(dir); String searchWords = "title:Lucene AND content:Java"; Analyzer language = new StandardAnalyzer(); Query query = QueryParser.parse(searchWords, "title", language); Hits results = indexSearcher.search(query); System.out.println(results.length() + "search results for query " + searchWords); } 通配符搜索(Wildcard Search) Lucene 支持两种通配符:问号(?)和星号(*)。你可以使用问号(?)来进行单字符的通配符查询,或者利用星号(*)进行多字符的通配符查询。例如,如果你想搜索 tiny 或者 tony,你就可以使用查询语句 “t?ny”;如果你想查询 Teach, Teacher 和 Teaching,你就可以使用查询语句 “Teach*”。清单3 显示了通配符查询的过程。 清单3:进行通配符查询
//Test wildcard search public void testWildcardSearch(String indexDirectory)throws Exception{ Directory dir = FSDirectory.getDirectory(indexDirectory,false); IndexSearcher indexSearcher = new IndexSearcher(dir); String[] searchWords = {"tex*", "tex?", "?ex*"}; Query query; for(int i = 0; i < searchWords.length; i++){ query = new WildcardQuery(new Term("title",searchWords[i])); Hits results = indexSearcher.search(query); System.out.println(results.length() + "search results for query " + searchWords[i]); } } 模糊查询 Lucene 提供的模糊查询基于编辑距离算法(Edit distance algorithm)。你可以在搜索词的尾部加上字符 ~ 来进行模糊查询。例如,查询语句 “think~” 返回所有包含和 think 类似的关键词的文档。清单 4 显示了如果利用 Lucene 的 API 进行模糊查询的代码。 清单4:实现模糊查询
//Test fuzzy search public void testFuzzySearch(String indexDirectory)throws Exception{ Directory dir = FSDirectory.getDirectory(indexDirectory,false); IndexSearcher indexSearcher = new IndexSearcher(dir); String[] searchWords = {"text", "funny"}; Query query; for(int i = 0; i < searchWords.length; i++){ query = new FuzzyQuery(new Term("title",searchWords[i])); Hits results = indexSearcher.search(query); System.out.println(results.length() + "search results for query " + searchWords[i]); } } 范围搜索(Range Search) 范围搜索匹配某个域上的值在一定范围的文档。例如,查询 “age:[18 TO 35]” 返回所有 age 域上的值在 18 到 35 之间的文档。清单5显示了利用 Lucene 的 API 进行返回搜索的过程。 清单5:测试范围搜索
//Test range search public void testRangeSearch(String indexDirectory)throws Exception{ Directory dir = FSDirectory.getDirectory(indexDirectory,false); IndexSearcher indexSearcher = new IndexSearcher(dir); Term begin = new Term("birthDay","20000101"); Term end = new Term("birthDay","20060606"); Query query = new RangeQuery(begin,end,true); Hits results = indexSearcher.search(query); System.out.println(results.length() + "search results is returned"); } |
|