2.1 索引过程图解
2.2 索引建立步骤
1.创建Directory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
package
com.mzsx.write;
import
java.io.File;
import
java.io.IOException;
import
org.apache.lucene.store.Directory;
import
org.apache.lucene.store.FSDirectory;
public
class
DirectoryConext {
privatestatic Directory directory=
null
;
privateDirectoryConext(){}
publicstatic Directory getDirectory(String fileName){
if
(directory==
null
) {
synchronized
(DirectoryConext.
class
){
if
(directory==
null
) {
try
{
directory=FSDirectory.open(
new
File(fileName));
}
catch
(IOException e) {
e.printStackTrace();
}
}
}
}
returndirectory;
}
}
|
2. 创建Writer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
package
com.mzsx.write;
import
java.io.IOException;
import
org.apache.lucene.analysis.Analyzer;
import
org.apache.lucene.index.CorruptIndexException;
import
org.apache.lucene.index.IndexWriter;
import
org.apache.lucene.index.IndexWriterConfig;
import
org.apache.lucene.store.Directory;
importorg.apache.lucene.store.LockObtainFailedException;
import
org.apache.lucene.util.Version;
public
class
IndexWriterContext {
privatestatic IndexWriter indexWrite=
null
;
privatestatic Directory directory=
null
;
privatestatic Analyzer analyzer=
null
;
privateIndexWriterContext(){}
publicstatic IndexWriter getIndexWrite(String fileName,Analyzer a){
try
{
if
(indexWrite==
null
) {
directory=DirectoryConext.getDirectory(fileName);
synchronized
(IndexWriterContext.
class
){
if
(indexWrite==
null
) {
indexWrite=newIndexWriter(directory,
new
IndexWriterConfig(Version.LUCENE_35,a));
//indexWrite.commit();
}
}
}
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(LockObtainFailedException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
returnindexWrite;
}
publicstatic IndexWriter getIndexWrite(Directory dir,Analyzer a){
try
{
if
(indexWrite==
null
) {
directory=dir;
synchronized
(IndexWriterContext.
class
){
if
(indexWrite==
null
) {
indexWrite=newIndexWriter(directory,
new
IndexWriterConfig(Version.LUCENE_35,a));
}
}
}
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(LockObtainFailedException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
returnindexWrite;
}
}
|
3. 创建文档并且添加索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// 创建索引
publicvoid createdIndex(String fName) {
try
{
indexWriter.deleteAll();
Filefile =
new
File(fName);
if
(!file.isDirectory()) {
try
{
thrownew Exception(
"您传入的不是一个目录路径。。。"
);
}
catch
(Exception e) {
e.printStackTrace();
}
}
for
(File f : file.listFiles()) {
Document doc =getDocument(f);
indexWriter.addDocument(doc);
}
indexWriter.commit();
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
catch
(Exception e) {
e.printStackTrace();
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
// 遍历文件生产document
protectedDocument getDocument(File f)
throws
Exception {
//System.out.println(FileUtils.readFileToString(f));
Documentdoc =
new
Document();
doc.add(newField(
"id"
, (
""
+ (id++)), Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(newField(
"contents"
, FileUtils.readFileToString(f),
Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));
doc.add(newField(
"filename"
, f.getName(), Field.Store.YES,
Field.Index.ANALYZED));
doc.add(newField(
"fullpath"
, f.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(newNumericField(
"size"
, Field.Store.YES,
true
).setLongValue(f.length()));
doc.add(newNumericField(
"date"
, Field.Store.YES,
true
).setLongValue(f.lastModified()));
returndoc;
}
|
4. 查询索引的基本信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// 查询文件数量
publicvoid queryNum() {
try
{
IndexReaderindexReader=IndexReader.open(directory);
IndexSearchersearcher =
new
IndexSearcher(indexReader);
System.out.println(
"searcher.maxDoc="
+ searcher.maxDoc());
System.out.println(
"indexReader.maxDoc="
+indexReader.maxDoc());
System.out.println(
"indexReader.numDocs="
+ indexReader.numDocs());
System.out.println(
"indexReader.numDeletedDocs="
+indexReader.numDeletedDocs());
searcher.close();
}
catch
(IOException e) {
e.printStackTrace();
}
}
|
5. 删除和更新索引
索引的删除主要包含了IndexWriter和IndexReader删除。但是IndexWriter是2.9版本周出现的其本质还是调用IndexReader进行删除操作。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
// 更新索引
publicvoid update(String field, String name) {
Documentdocu =
new
Document();
docu.add(newField(
"id"
,
"2222"
, Field.Store.YES,
Field.Index.NOT_ANALYZED));
docu.add(newField(
"contents"
,
"修改后的文件内容"
, Field.Store.NO,
Field.Index.ANALYZED_NO_NORMS));
docu.add(newField(
"filename"
,
"这是修改后的文件名"
, Field.Store.YES,
Field.Index.NOT_ANALYZED));
docu.add(newField(
"fullpath"
,
"这是修改后的文件后的文件路径"
, Field.Store.YES,
Field.Index.NOT_ANALYZED));
try
{
indexWriter.updateDocument(newTerm(field, name), docu,analyzer);
indexWriter.commit();
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
|
//删除指定ID
publicvoid deleteByIndexWriter(String field, String value){
try
{
indexWriter.deleteDocuments(newTerm(field,value));
indexWriter.commit();
//indexWriter.close();
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
|
//删除指定ID
publicvoid deleteByIndexReader(String field, String value){
try
{
indexReader.deleteDocuments(newTerm(field,value));
//必须close()
indexReader.close();
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
|
//删除恢复
publicvoid unDelete(){
try
{
indexReader.undeleteAll();
//必须close()
indexReader.close();
}
catch
(CorruptIndexException e) {
e.printStackTrace();
}
catch
(IOException e) {
e.printStackTrace();
}
}
|
2.3 域选项
1. 域索引选项
使用Field.Index.*来进行操作
Index.ANALYZED:进行分词和索引,适用于标题、内容等
Index.NOT_ANALYZED:进行索引,但是不进行分词,如果***号,姓名,ID等,适用于精确搜索
Index.ANALYZED_NOT_NORMS:进行分词但是不存储norms信息,这个norms中包括了创建索引的时间和权值等信息
Index.NOT_ANALYZED_NOT_NORMS:即不进行分词也不存储norms信息
Index.NO:不进行索引
注:没有norms意味着索引阶段禁用了文档boost和域的boost及长度标准化。好处在于节省内存,不用在搜索阶段为索引中的每篇文档的每个域都占用一个字节来保存norms信息了。但是对norms信息的禁用是必须全部域都禁用的,一旦有一个域不禁用,则其他禁用的域也会存放默认的norms值。因为为了加快norms的搜索速度,Lucene是根据文档号乘以每篇文档的norms信息所占用的大小来计算偏移量的,中间少一篇文档,偏移量将无法计算。也即norms信息要么都保存,要么都不保存。
2. 域存储选项
Field.Store.*
YES:将会存储域值,原始字符串的值会保存在索引,以此可以进行相应的恢复操作,对于主键,标题可以是这种方式存储
NO:不会存储域值,通常与Index.ANAYLIZED合起来使用,索引一些如文章正文等不需要恢复的文档
3.最佳实践
NOT_ANALYZED_NOT_NORMS |
YES |
标识符(主键、文件名),电话号码,***号,姓名,日期 |
ANAYLZED |
YES |
文档标题和摘要 |
ANAYLZED |
NO |
文档正文 |
NO |
YES |
文档类型,数据库主键(不进行索引) |
NOT_ANALYZED |
NO |
隐藏关键字 |
2.4 其他知识
1. 对数字和日期进行索引
(1)、对数字进行索引可以使用分词器进行不同的索引
·WhitespaceAnalyzer和StandardAnalyzer会索引数字
·SimpleAnalyzer和StopAnalyzer不会索引数字
(2)、在3.0之后添加了数字域来完成数字和日期的索引
1
2
|
doc.add(
new
NumericField(
"size"
, Field.Store.YES,
true
).setLongValue(f.length()));
doc.add(
new
NumericField(
"date"
, Field.Store.YES,
true
).setLongValue(f.lastModified()));
|
2.常用的Directory
FSDDirectory.open会根据当前的运行环境打开一个最合理的基于File的Directory
new RAMDirectory()会从内存中打开directory,好处是速度快,缺点是无法持久化
3. IndexReader和IndexWriter的生命周期
对于IndexReader而言,反复使用Index.open打开会有很大的开销,所以一般在整个程序的生命周期中只会打开一个IndexReader,通过这个IndexReader来创建不同的IndexSearcher,如果使用单例模式,可能出现的问题有:
(1)、当使用Writer修改了索引之后不会更新信息,所以需要使用IndexReader.openIfChange方法操作
如果IndexWriter在创建完成之后,没有关闭,需要进行commit操作之后才能提交
本文转自 梦朝思夕 51CTO博客,原文链接:http://blog.51cto.com/qiangmzsx/1440487