【Lucene4.8教程之一】使用Lucene4.8进行索引及搜索的基本操作

释放双眼，带上耳机，听听看~！

在Lucene对文本进行处理的过程中，可以大致分为三大部分：

1、索引文件：提取文档内容并分析，生成索引

2、搜索内容：搜索索引内容，根据搜索关键字得出搜索结果

3、分析内容：对搜索词汇进行分析，生成Query对象。

注：事实上，除了最基本的完全匹配搜索以外，其它都需要在搜索前进行分析。

如不加分析步骤，则搜索JAVA，是没有结果的，因为在索引过程中已经将词汇均转化为小写，而此处搜索时则要求关键字完全匹配。

使用了QueryParser类以后，则根据Analyzer的具体实现类，对搜索词汇进行分析，如大小写转换，java and ant等的搜索词解释等。

一、索引文件

基本步骤如下：

1、创建索引库IndexWriter

2、根据文件创建文档Document

3、向索引库中写入文档内容


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
1package com.ljh.search.index;

2

3import java.io.File;

4import java.io.FileReader;

5import java.io.IOException;

6

7import org.apache.lucene.analysis.standard.StandardAnalyzer;

8import org.apache.lucene.document.Document;

9import org.apache.lucene.document.Field;

10import org.apache.lucene.document.LongField;

11import org.apache.lucene.document.StringField;

12import org.apache.lucene.document.TextField;

13import org.apache.lucene.index.IndexWriter;

14import org.apache.lucene.index.IndexWriterConfig;

15import org.apache.lucene.store.Directory;

16import org.apache.lucene.store.FSDirectory;

17import org.apache.lucene.util.Version;

18

19// 1、创建索引库IndexWriter

20// 2、根据文件创建文档Document

21// 3、向索引库中写入文档内容

22

23public class IndexFiles {

24

25  public static void main(String[] args) throws IOException {

26

27      String usage = &quot;java IndexFiles&quot;

28              + &quot; [-index INDEX_PATH] [-docs DOCS_PATH] \n\n&quot;

29              + &quot;This indexes the documents in DOCS_PATH, creating a Lucene index&quot;

30              + &quot;in INDEX_PATH that can be searched with SearchFiles&quot;;

31

32      String indexPath = null;

33      String docsPath = null;

34      for (int i = 0; i &lt; args.length; i++) {

35          if (&quot;-index&quot;.equals(args[i])) {

36              indexPath = args[i + 1];

37              i++;

38          } else if (&quot;-docs&quot;.equals(args[i])) {

39              docsPath = args[i + 1];

40              i++;

41          }

42      }

43

44      if (docsPath == null) {

45          System.err.println(&quot;Usage: &quot; + usage);

46          System.exit(1);

47      }

48

49      final File docDir = new File(docsPath);

50      if (!docDir.exists() || !docDir.canRead()) {

51          System.out

52                  .println(&quot;Document directory &#x27;&quot;

53                          + docDir.getAbsolutePath()

54                          + &quot;&#x27; does not exist or is not readable, please check the path&quot;);

55          System.exit(1);

56      }

57

58      IndexWriter writer = null;

59      try {

60          // 1、创建索引库IndexWriter

61          writer = getIndexWriter(indexPath);

62          index(writer, docDir);

63      } catch (IOException e) {

64          e.printStackTrace();

65      } finally {

66          writer.close();

67      }

68

69  }

70

71  private static IndexWriter getIndexWriter(String indexPath)

72          throws IOException {

73

74      Directory indexDir = FSDirectory.open(new File(indexPath));

75

76      IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,

77              new StandardAnalyzer(Version.LUCENE_48));

78

79      IndexWriter writer = new IndexWriter(indexDir, iwc);

80

81      return writer;

82  }

83

84  private static void index(IndexWriter writer, File file) throws IOException {

85

86      if (file.isDirectory()) {

87          String[] files = file.list();

88          if (files != null) {

89              for (int i = 0; i &lt; files.length; i++) {

90                  index(writer, new File(file, files[i]));

91              }

92          }

93      } else {

94          // 2、根据文件创建文档Document

95          Document doc = new Document();

96          Field pathField = new StringField(&quot;path&quot;, file.getPath(),

97                  Field.Store.YES);

98          doc.add(pathField);

99          doc.add(new LongField(&quot;modified&quot;, file.lastModified(),

100                 Field.Store.NO));

101         doc.add(new TextField(&quot;contents&quot;, new FileReader(file)));

102         System.out.println(&quot;Indexing &quot; + file.getName());

103         

104         // 3、向索引库中写入文档内容

105         writer.addDocument(doc);

106     }

107

108 }

109

110}

111

112

（1）使用“java com.ljh.search.index.IndexFiles -index d:/index -docs d:/tmp”运行程序（注意Java类名区分大小写），索引d:/tmp中的文件，并将索引文件放置到d:/index。

（2）上述生成的索引文件可以使用Luke进行查看。目前Luke已迁移至github进行托管。
二、搜索文件

1、打开索引库IndexSearcher
2、根据关键词进行搜索
3、遍历结果并处理


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
1package com.ljh.search.search;

2

3//1、打开索引库IndexSearcher

4//2、根据关键词进行搜索

5//3、遍历结果并处理

6import java.io.File;

7import java.io.IOException;

8

9import org.apache.lucene.index.DirectoryReader;

10import org.apache.lucene.index.IndexReader;

11import org.apache.lucene.index.Term;

12import org.apache.lucene.search.IndexSearcher;

13import org.apache.lucene.search.ScoreDoc;

14import org.apache.lucene.search.TermQuery;

15import org.apache.lucene.search.TopDocs;

16import org.apache.lucene.store.Directory;

17import org.apache.lucene.store.FSDirectory;

18

19public class Searcher {

20  public static void main(String[] args) throws IOException {

21

22      String indexPath = null;

23      String term = null;

24      for (int i = 0; i &lt; args.length; i++) {

25          if (&quot;-index&quot;.equals(args[i])) {

26              indexPath = args[i + 1];

27              i++;

28          } else if (&quot;-term&quot;.equals(args[i])) {

29              term = args[i + 1];

30              i++;

31          }

32      }

33

34      System.out.println(&quot;Searching &quot; + term + &quot; in &quot; + indexPath);

35

36      // 1、打开索引库

37      Directory indexDir = FSDirectory.open(new File(indexPath));

38      IndexReader ir = DirectoryReader.open(indexDir);

39      IndexSearcher searcher = new IndexSearcher(ir);

40

41      // 2、根据关键词进行搜索

42      TopDocs docs = searcher.search(

43              new TermQuery(new Term(&quot;contents&quot;, term)), 20);

44

45      // 3、遍历结果并处理

46      ScoreDoc[] hits = docs.scoreDocs;

47      System.out.println(hits.length);

48      for (ScoreDoc hit : hits) {

49          System.out.println(&quot;doc: &quot; + hit.doc + &quot; score: &quot; + hit.score);

50      }

51

52      ir.close();

53

54  }

55

56}

57

58

三、分析

事实上，除了最基本的完全匹配搜索以外，其它都需要在搜索前进行分析。

如不加分析步骤，则搜索JAVA，是没有结果的，因为在索引过程中已经将词汇均转化为小写，而此处搜索时则要求关键字完全匹配。

使用了QueryParser类以后，则根据Analyzer的具体实现类，对搜索词汇进行分析，如大小写转换，java and ant等的搜索词解释等。
分析过程有2个基本步骤：

1、生成QueryParser对象

2、调用QueryParser.parse()生成Query对象。

具体代码，将下述代码：


1
2
3
4
1       // 2、根据关键词进行搜索

2       TopDocs docs = searcher.search(

3               new TermQuery(new Term(&quot;contents&quot;, term)), 20);

4


1
2
用以下代码代替：

2


1
2
3
4
5
6
7
8
9
10
11
12
1       // 2、根据关键词进行搜索

2       /*TopDocs docs = searcher.search(

3               new TermQuery(new Term(&quot;contents&quot;, term)), 10);*/

4       QueryParser parser = new QueryParser(Version.LUCENE_48, &quot;contents&quot;, new SimpleAnalyzer(Version.LUCENE_48));

5       Query query = null;

6       try {

7           query = parser.parse(term);

8       } catch (ParseException e) {

9           e.printStackTrace();

10      }

11      TopDocs docs = searcher.search(query, 30);

12

{{userData.name}}已认证

【Lucene4.8教程之一】使用Lucene4.8进行索引及搜索的基本操作

OpenSSH-8.7p1离线升级修复安全漏洞

设计模式的设计原则

{{userData.name}}已认证

Related posts:

OpenSSH-8.7p1离线升级修复安全漏洞

设计模式的设计原则

DevOps基础-1.3-DevOps的原则：三大方法

给 DevOps 初学者的入门指南

kubernetes+ prometheus自动伸缩的设计与实现(一)

Nginx日志分析