1.导入jar包(Lucene 4.9 的 core 与 analyzers-common、IKAnalyzer、Jericho HTML Parser、commons-io、JUnit)
2.创建实体Bean
文件:HtmlBean.java
| 1package com.zhishang.lucene;
2
/**
 * Simple mutable value holder describing one HTML page: its title, its text
 * content and the location it was read from.
 *
 * <p>All three properties start out as {@code null} until the corresponding
 * setter is called.
 *
 * <p>Created by Administrator on 2017/7/8.
 */
public class HtmlBean {
    /** Page title. */
    private String title;
    /** Text content of the page. */
    private String content;
    /** Location of the page (a URL or file path, as supplied by the caller). */
    private String url;

    /**
     * @param title the page title to store
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @param content the page text content to store
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @param url the page location to store
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * @return the stored title, or {@code null} if none was set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the stored content, or {@code null} if none was set
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the stored location, or {@code null} if none was set
     */
    public String getUrl() {
        return url;
    }
}
35 |
3.创建工具Bean
文件:HtmlBeanUtil.java
| 1package com.zhishang.lucene;
2
3import net.htmlparser.jericho.Element;
4import net.htmlparser.jericho.HTMLElementName;
5import net.htmlparser.jericho.Source;
6import org.junit.Test;
7
8import java.io.File;
9import java.io.IOException;
10
11/**
12 * Created by Administrator on 2017/7/8.
13 */
14public class HtmlBeanUtil {
15
16
17 public static HtmlBean parseHtml(File file){
18 try {
19 Source sc = new Source(file);
20 Element element = sc.getFirstElement(HTMLElementName.TITLE);
21 if (element == null || element.getTextExtractor() == null){
22 return null;
23 }
24
25 HtmlBean htmlBean = new HtmlBean();
26 htmlBean.setTitle(element.getTextExtractor().toString());
27 htmlBean.setContent(sc.getTextExtractor().toString());
28 htmlBean.setUrl(file.getAbsolutePath());
29
30 return htmlBean;
31 } catch (IOException e) {
32 e.printStackTrace();
33 }
34
35 return null;
36 }
37}
38 |
4.创建操作Bean
文件:CreateIndex.java
| 1package com.zhishang.lucene;
2
3import org.apache.commons.io.FileUtils;
4import org.apache.commons.io.filefilter.TrueFileFilter;
5import org.apache.lucene.analysis.Analyzer;
6import org.apache.lucene.analysis.standard.StandardAnalyzer;
7import org.apache.lucene.document.*;
8import org.apache.lucene.index.IndexWriter;
9import org.apache.lucene.index.IndexWriterConfig;
10import org.apache.lucene.store.Directory;
11import org.apache.lucene.store.FSDirectory;
12import org.apache.lucene.store.RAMDirectory;
13import org.apache.lucene.util.Version;
14import org.junit.Test;
15import org.wltea.analyzer.lucene.IKAnalyzer;
16
17import java.io.File;
18import java.io.IOException;
19import java.util.Collection;
20
21/**
22 * Created by Administrator on 2017/7/7.
23 */
24public class CreateIndex {
25 public static final String indexDir = "G:/index";
26 public static final String dataDir = "G:/data";
27
28 public void createIndex(){
29 try {
30 Directory dir = FSDirectory.open(new File(indexDir));
31 //分词器
32 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
33 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,analyzer);
34 config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
35 IndexWriter writer = new IndexWriter(dir,config);
36 File file = new File(dataDir);
37
38 RAMDirectory ramdir = new RAMDirectory();
39 Analyzer analyzer1 = new IKAnalyzer();
40 IndexWriterConfig config1 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer1);
41 IndexWriter ramWriter = new IndexWriter(ramdir,config1);
42
43 Collection<File> files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE,TrueFileFilter.INSTANCE);
44 int count = 0;
45 for(File f:files){
46 HtmlBean bean = HtmlBeanUtil.parseHtml(f);
47 if(bean != null){
48 Document document = new Document();
49 document.add(new StringField("title",bean.getTitle(), Field.Store.YES));
50 document.add(new TextField("content",bean.getContent(), Field.Store.YES));
51 document.add(new StringField("url",bean.getUrl(), Field.Store.YES));
52 ramWriter.addDocument(document);
53 count++;
54 if (count == 50){
55 ramWriter.close();
56 writer.addIndexes(ramdir);
57 ramdir = new RAMDirectory();
58 Analyzer analyzer2 = new IKAnalyzer();
59 IndexWriterConfig config2 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer2);
60 ramWriter = new IndexWriter(ramdir,config2);
61 count = 0;
62 }
63
64 }
65 }
66 writer.close();
67 } catch (IOException e) {
68 e.printStackTrace();
69 }
70
71 }
72}
73 |
5.创建测试Bean
文件:LuceneBean.java
| 1package com.zhishang.lucene;
2
3import org.apache.lucene.analysis.Analyzer;
4import org.apache.lucene.analysis.standard.StandardAnalyzer;
5import org.apache.lucene.index.IndexWriter;
6import org.apache.lucene.index.IndexWriterConfig;
7import org.apache.lucene.store.Directory;
8import org.apache.lucene.store.FSDirectory;
9import org.apache.lucene.util.Version;
10import org.junit.Test;
11
12import java.io.File;
13
14/**
15 * Created by Administrator on 2017/7/8.
16 */
17public class LuceneBean {
18
19 /*
20 创建索引
21 */
22 @Test
23 public void createIndex(){
24 File file = new File(CreateIndex.indexDir);
25 if (file.exists()){
26 file.delete();
27 file.mkdirs();
28 }
29 CreateIndex createIndex = new CreateIndex();
30 createIndex.createIndex();
31 }
32}
33 |
6.查看生成的索引文件
本文转自 素颜猪 51CTO博客,原文链接:http://blog.51cto.com/suyanzhu/1945466