用Lucene做一个简单的Java搜索工具

　2007-12-23 12:27:00　来源：WEB开发网

核心提示：初学Lucene，刚接触搜索引擎，知道了一点点，想用Lucene做一个简单的Java搜索工具，实现根据“单词”搜索某个Java源文件，比如输入“String”去查询哪些Java源文件里用到了这个类。

　初学Lucene，刚接触搜索引擎。知道了一点点，想做个小工具，实现根据“单词”搜索某个Java源文件。比如输入“String”去查询哪些Java源文件里用到了这个类。

　这个想法的来源是，在以前刚学java时，有一本java基础教程的书的附带光盘里有作者写的一个程序，可以方便初学者查找某些类在哪个实例里出现。当时没有太在意，觉得作者的代码很长。所以现在想自己也写一个这样的小程序。

　 开发工具与运行环境：使用Lucene2.0的包，jdk1.5，在WindowsXP下运行。

　 思路分析与设计：

　整个程序里，除了Lucene的必要操作外，就是IO的基本操作了。因为要对某目录下及其子目录下的所有Java源文件进行索引，就要用到递归，同时要过滤掉非Java源文件。根据这种情况，设计了以下5个类。

主类：索引类（IndexJavaFiles）、搜索类（SearchJavaFiles）；异常类：索引异常类（IndexException）、搜索异常类（SearchException）；还有一个文件过滤工厂类（FileFilterFactory）。

　异常类不是必要的，特意设计来包装IO异常、文件异常和Lucene的异常。文件过滤工厂类的出现并不是故弄玄虚，只是不想太多代码集中在一起，就把文件过滤器的设计放到一个类里。下面是程序的完整代码及注释。

IndexJavaFiles.java /** *indexthejavasourcefiles */ package powerwind; import java.io.*; import java.util.Date; import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; /** *@authorPowerwind *@version1.0 */ publicclass IndexJavaFiles { 　 /** 　 *默认构造方法　 */ 　 public IndexJavaFiles() { 　 } 　 /** 　 * 这个私有递归方法由index方法调用，保证index传入的file是目录不是文件　 * 　 *@paramwriter 　 *@paramfile 　 *@paramff 　 *@throwsIndexException 　 */ 　 PRivatevoid indexDirectory(IndexWriter writer, File file, FileFilter filter)throws IndexException { 　　　 if (file.isDirectory()) { 　　　　　 // 有选择地（过滤）获取目录下的文件和目录　　　　　 File[] files = file.listFiles(filter); 　　　　　 // 非空目录　　　　　 if (files != null) { 　　　　　　 for (int i = 0; i < files.length; i++) { 　　　　　　　　 indexDirectory(writer, files[i], filter); 　　　　　　 } 　　　　　 } 　　　 } else { 　　　　　 try { 　　　　　　　// 这里的file经过先前的过滤　　　　　　 writer.addDocument(parseFile(file)); 　　　　　　 System.out.println("增加文件： " + file); 　　　　　 } catch (IOException ioe) { 　　　　　　 thrownew IndexException(ioe.getMessage()); 　　　　　 } 　　　 } 　 } 　 /** 　 *传参数是文件就直接索引，若是目录则交给indexDirectory递归　 * 　 *@paramwriter 　 *@paramfile 　 *@paramff 　 *@throwsIndexException 　 */ 　 publicvoid index(IndexWriter writer, File file, FileFilter filter) throws IndexException { 　　　 // 确定可读　　　 if (file.exists() && file.canRead()) { 　　　　　 if (file.isDirectory()) { 　　　　　　 indexDirectory(writer, file, filter); 　　　　　 } elseif (filter.accept(file)) { 　　　　　　 try { 　　　　　　　　 writer.addDocument(parseFile(file)); 　　　　　　　　 System.out.println("增加文件： " + file); 　　　　　　 } catch (IOException ioe) { 　　　　　　　　 thrownew IndexException(ioe.getMessage()); 　　　　　　 } 　　　　　 } else { 　　　　　　 System.out.println("指定文件或目录错误，没有完成索引"); 　　　　　 } 　　　 } 　 } 　 /** 　 *@paramfile 　 * 　 *把File变成Document 　 */ 　 private Document parseFile(File file) throws IndexException { 　　　 Document doc = new Document(); 　　　 doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, 　　　　　　　　　　 Field.Index.UN_TOKENIZED)); 　　　 try { 　　　　　 doc.add(new 
Field("contents", new FileReader(file))); 　　　 } catch (FileNotFoundException fnfe) { 　　　　　 thrownew IndexException(fnfe.getMessage()); 　　　 } 　　　 return doc; 　 } } 进入讨论组讨论。
index(IndexWriter writer, File file, FileFilter filter)调用私有方法indexDirectory(IndexWriter writer, File file, FileFilter filter)完成文件的索引。下面是IndexException异常类。 IndexException.java package powerwind; publicclass IndexException extends Exception { 　 public IndexException(String message) { 　　　 super("Throw IndexException while indexing files: " + message); 　 } } 下面是FileFilterFactory类，返回一个特定的文件过滤器（FileFilter）。 FileFilterFactory.java package powerwind; import java.io.*; publicclass FileFilterFactory { 　 /** 　 *静态匿名内部类　 */ 　 privatestatic FileFilter filter = new FileFilter() { 　　　 publicboolean accept(File file) { 　　　　　 long len; 　　　　　 return file.isDirectory() 　　　　　　　　 (file.getName().endsWith(".java") && 　　　　　　　　 ((len = file.length()) > 0) && len < 1024 * 1024); 　　　 } 　 }; 　 publicstatic FileFilter getFilter() { 　　　 returnfilter; 　 } } main方法　 /** 　 *　　 main方法　 */ 　 publicstaticvoid main(String[] args) throws Exception { 　　　 IndexJavaFiles ijf = new IndexJavaFiles(); 　　　 Date start = new Date(); 　　　 try { 　　　　　 IndexWriter writer = IndexWriterFactory.newInstance().createWriter("./index", true); 　　　　　 System.out.println("Indexing ..."); 　　　　　 ijf.index(writer, new File("."), FileFilterFactory.getFilter()); 　　　　　 System.out.println("Optimizing..."); 　　　　　 writer.optimize(); 　　　　　 writer.close(); 　　　　　 Date end = new Date(); 　　　　　 System.out.println(end.getTime() - start.getTime() + " total milliseconds"); 　　　 } catch (IOException e) { 　　　　　 System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); 　　　 } 　 } SearchJavaFiles.java package powerwind; import java.io.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.*; import org.apache.lucene.search.*; publicclass SearchJavaFiles { 　 private IndexSearcher searcher; 　 private QueryParser parser; 　 /** 　 * 　 
*@paramsearcher 　 */ 　 public SearchJavaFiles(IndexSearcher searcher) { 　　　 this.searcher = searcher; 　 } 　 /** 　 * 　 *@paramfield 　 *@paramanalyzer 　 */ 　 publicvoid setParser(String field, Analyzer analyzer) { 　　　 setParser(new QueryParser(field, analyzer)); 　 } 　 /** 　 *@paramparser 　 */ 　 publicvoid setParser(QueryParser parser) { 　　　 this.parser = parser; 　 } 　 /** 　 * 　 *@paramquery 　 *@returnHits 　 *@throwsSearchException 　 */ 　 public Hits serach(Query query) throws SearchException { 　　　 try { 　　　　　 returnsearcher.search(query); 　　　 } catch (IOException ioe) { 　　　　　 thrownew SearchException(ioe.getMessage()); 　　　 } 　 } 　 /** 　 * 　 *@paramqueryString 　 *@returnHits 　 *@throwsSearchException 　 */ 　 public Hits serach(String queryString) throws SearchException { 　　　 if (parser == null) 　　　　　 thrownew SearchException("parser is null!"); 　　　 try { 　　　　　 returnsearcher.search(parser.parse(queryString)); 　　　 } catch (IOException ioe) { 　　　　　 thrownew SearchException(ioe.getMessage()); 　　　 } catch (ParseException pe) { 　　　　　 thrownew SearchException(pe.getMessage()); 　　　 } 　 } 　 /** 　 * 　 *输出hits的结果，从start开始到end，不包括end 　 * 　 *@paramhits 　 *@paramstart 　 *@paramend 　 *@throwsSearchException 　 */ 　 publicstatic Hits display(Hits hits, int start, int end) throws SearchException { 　　　 try { 　　　　　 while (start < end) { 　　　　　　 Document doc = hits.doc(start); 　　　　　　 String path = doc.get("path"); 　　　　　　 if (path != null) { 　　　　　　　　 System.out.println((start + 1) + "- " + path); 　　　　　　 } else { 　　　　　　　　 System.out.println((start + 1) + "- " + "No such path"); 　　　　　　 } 　　　　　　 start++; 　　　　　 } 　　　 } catch (IOException ioe) { 　　　　　 thrownew SearchException(ioe.getMessage()); 　　　 } 　　　 return hits; 　 } 进入讨论组讨论。
main方法　 /** 　 *@paramargs 　 */ 　 publicstaticvoid main(String[] args) throws Exception { 　　　 String field = "contents"; 　　　 String index = "./index"; 　　　 finalint rows_per_page = 2; 　　　 finalchar NO = 'n'; 　　　 SearchJavaFiles sjf = new SearchJavaFiles(new IndexSearcher(IndexReader.open(index))); 　　　 sjf.setParser(field, new StandardAnalyzer()); 　　　 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); 　　　 while (true) { 　　　　　 System.out.println("Query: "); 　　　　　 String line = in.readLine(); 　　　　　 if (line == null line.length() < 2) { 　　　　　　 System.out.println("eixt query"); 　　　　　　 break; 　　　　　 } 　　　　　 Hits hits = sjf.serach(line); 　　　　　 System.out.println("searching for " + line + " Result is "); 　　　　　 int len = hits.length(); 　　　　　 int i = 0; 　　　　　 if (len > 0) 　　　　　　 while (true) { 　　　　　　　　 if (i + rows_per_page >= len) { 　　　　　　　　　　 SearchJavaFiles.display(hits, i, len); 　　　　　　　　　　 break; 　　　　　　　　 } else { 　　　　　　　　　　 SearchJavaFiles.display(hits, i, i += rows_per_page); 　　　　　　　　　　 System.out.println("more y/n?"); 　　　　　　　　　　 line = in.readLine(); 　　　　　　　　　　 if (line.length() < 1 line.charAt(0) == NO) 　　　　　　　　　　　　 break; 　　　　　　　　 } 　　　　　　 } 　　　　　 else 　　　　　　 System.out.println("not found"); 　　　 } 　 } } SearchException.java package powerwind; publicclass SearchException extends Exception { 　 public SearchException(String message) { 　　　 super("Throw SearchException while searching files: " + message); 　 } }

完善设想： 1、文件格式： 能够处理Zip文件和Jar文件，索引里面的Java源文件；通过反射机制索引class类文件。 2、输入输出： 除控制台输入输出外，还可以选择从文件读取查询关键字，输出查询结果到文件。 3、用户界面： 图形界面操作，双击查询结果的某条记录可以打开相应文件。 4、性能方面： 索引文件时，用缓存和多线程处理。

进入讨论组讨论。

（出处：http://www.cncms.com）