- 論壇徽章:
- 0
|
搜索測試
package org.surpass.test;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
public class Search {
Date startTime, endTime;
/**
* 索引文件的存放位置,本例是放入內(nèi)存中.
*/
private Directory path = new RAMDirectory();
/**
* 創(chuàng)建索引
*/
public void createLuceneIndex() {
IndexWriter writer;
try {
writer = new IndexWriter(path, new ChineseAnalyzer(), true,
IndexWriter.MaxFieldLength.LIMITED);
Document docA = new Document();
// 相當(dāng)于數(shù)據(jù)庫中列的概念,因此第一個參數(shù)是列名,第二個參數(shù)是列的值,最后兩個參數(shù)是enum類型的(JDK1.5),對創(chuàng)建的索引的設(shè)置
// Field.Store 是否覆蓋原來的索引文件,而不是重新建一個
Field fieldA = new Field("content", "搜索引擎19:58:25", Store.YES,
Index.ANALYZED);
// 我們把列(fieldA)加到某一行(docA)中
docA.add(fieldA);
// 英文 測試
docA.add(new Field("content", "hello lucene ,I love you",
Store.YES, Index.ANALYZED));
docA.add(new Field("lastModifyTime", "2010個人 19:58:25", Store.YES,
Index.ANALYZED));
Document docB = new Document();
// 相當(dāng)于數(shù)據(jù)庫中列的概念,因此第一個參數(shù)是列名,第二個參數(shù)是列的值,最后兩個參數(shù)是enum類型的(JDK1.5),對創(chuàng)建的索引的設(shè)置
Field fieldB = new Field("content", "創(chuàng)建索引", Store.YES,
Index.ANALYZED);
// 我們把列(fieldB)加到某一行(docB)中
docB.add(fieldB);
docB.add(new Field("content", "i live in shanghai.i come from cn",
Store.YES, Index.ANALYZED));
docB.add(new Field("lastModifyTime", "2020個人", Store.YES,
Index.ANALYZED));
Document docC = new Document();
Field fieldC = new Field("content", "19:58:25", Store.YES,
Index.ANALYZED);
// 我們把列(fieldC)加到某一行(docC)中
docC.add(fieldC);
docC.add(new Field("content", "this is a test demo", Store.YES,
Index.ANALYZED));
docC.add(new Field("lastModifyTime", "2010", Store.YES,
Index.ANALYZED));
writer.addDocument(docA);
writer.addDocument(docB);
writer.addDocument(docC);
// 如果對海量數(shù)據(jù)進(jìn)行創(chuàng)建索引的時候,需要對索引進(jìn)行優(yōu)化,以便提高速度
writer.optimize();
// 跟數(shù)據(jù)庫類似,打開一個連接,使用完后,要關(guān)閉它
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* 創(chuàng)建文件索引
*/
public void createIndexByFile() {
IndexWriter writer;
try {
File file = new File("test.txt");
String filePath = file.getAbsolutePath();
System.out.printf("fielPahth:====" + filePath);
System.out
.printf("\n====================================================\n");
String content = file2String(filePath, "UTF-8");
System.out.printf("content:====" + content);
System.out
.printf("\n====================================================\n");
writer = new IndexWriter(path, new ChineseAnalyzer(), true,
IndexWriter.MaxFieldLength.LIMITED);
Document docA = new Document();
Field fieldA = new Field("content", content, Field.Store.YES,
Field.Index.ANALYZED);
docA.add(new Field("path", filePath, Field.Store.YES,
Field.Index.NOT_ANALYZED));
docA.add(fieldA);
writer.addDocument(docA);
// 如果對海量數(shù)據(jù)進(jìn)行創(chuàng)建索引的時候,需要對索引進(jìn)行優(yōu)化,以便提高速度
writer.optimize();
// 跟數(shù)據(jù)庫類似,打開一個連接,使用完后,要關(guān)閉它
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private String file2String(String fileName, String charset)
throws Exception {
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(fileName), charset));
// StringBuilder ,StringBuffer
StringBuilder builder = new StringBuilder();
String line = null;
while ((line = reader.readLine()) != null) {
builder.append(line);
}
return builder.toString();
}
/**
* 相當(dāng)于sql中where 后面的條件,WildcardQuery不推薦大家使用 通配符搜索
*/
private Query wildcardQuery() {
// where username = 'lucene' and password='apache'
// ?代表至少有一個字符在前面
// 搜索"*搜*",找到一條數(shù)據(jù);搜索"*索*",找到兩條數(shù)據(jù);搜索"*搜索*",找到0條數(shù)據(jù);搜索"*索引*",找到0條數(shù)據(jù);
Term term = new Term("content", "*索*");
return new WildcardQuery(term);
}
// 基于lucene的分詞 -- TermQuery只能對單個中文進(jìn)行搜索。英文只能對當(dāng)個單詞進(jìn)行搜索
public Query termQuery() {
Term term = new Term("content", "come");
// Term term = new Term("content", "搜");
return new TermQuery(term);
}
/**
* 智能搜索
*
* @return
*/
public Query queryParser() {
QueryParser queryParser = new QueryParser(Version.LUCENE_30,
"content", new ChineseAnalyzer());
try {
return queryParser.parse("搜索 擎");
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
/**
* '與或'--搜索
*
* @return
*/
public Query booleanQuery() {
Term term1 = new Term("content", "索");
Term term2 = new Term("content", "搜");
TermQuery tempQuery1 = new TermQuery(term1);
TermQuery tempQuery2 = new TermQuery(term2);
// 本人覺得他更應(yīng)該叫做JoinQuery
BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.add(tempQuery1, BooleanClause.Occur.MUST);
booleanQuery.add(tempQuery2, BooleanClause.Occur.SHOULD);
return booleanQuery;
}
/**
* 多關(guān)鍵詞搜索
*
* @return
*/
public Query phraseQuery() {
PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.setSlop(1);
phraseQuery.add(new Term("content", "搜"));
phraseQuery.add(new Term("content", "擎"));
return phraseQuery;
}
/**
* 范圍搜索
*
* @return
*/
public Query rangeQuery() {
Set set = new HashSet();
SpanQuery rangeQuery = new SpanTermQuery(new Term("lastModifyTime",
"20100603"));
set.add(new Term("lastModifyTime", "20150808"));
rangeQuery.extractTerms(set);
return rangeQuery;
}
public void search() {
try {
// 相當(dāng)于sql中的 select * from talbeName
IndexSearcher search = new IndexSearcher(path);
startTime = new Date();
// 抽象的查詢對象
Query query = queryParser();
// query = wildcardQuery();
//query = termQuery();
//query = phraseQuery();
//query = booleanQuery();
// 搜索結(jié)果集和JDBC的查詢結(jié)果集完全類似的概念 -- 為什么是這樣的呢?
// lucene在設(shè)計(jì)的時候,就參照了JDBC的很多概念
TopDocs topDocs = search.search(query, 5);
if (topDocs != null) {
System.out.println("命中:" + topDocs.totalHits);
// 輸出結(jié)果
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (int i = 0; i < scoreDocs.length; i++) {
try {
Document targetDoc = search.doc(scoreDocs[i].doc);
System.out.println("內(nèi)容:" + targetDoc.toString());
System.out.println(scoreDocs[i].score);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println("===========================");
}
}
endTime = new Date();
System.out.println("本次搜索用時:"
+ (endTime.getTime() - startTime.getTime()) + "毫秒");
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* @param args
*/
public static void main(String[] args) {
Search search = new Search();
search.createLuceneIndex();
// search.createIndexByFile();
search.search();
}
}
test.txt 需手動創建,內容為要索引的內容。 |
|