Lucene创建索引入门案例

112次阅读

共计 14024 个字符，预计需要花费 36 分钟才能阅读完成。

最近在学习 Lucene，参考网上的资料写了一个简单的搜索 demo。

项目 jar 包：

Lucene 创建索引入门案例

// 索引关键类

package com.lucene.index;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lucene.vo.User;

/**
*  * lucene 检索内存索引非常简单的例子  *  * @author Administrator  *  
*/
public class searchIndex {
private String[] ids = { “1”, “2”, “3”, “4”, “5”, “6”};
private String[] emails = { “aa@itat.org”, “bb@itat.org”, “cc@cc.org”, “dd@sina.org”, “ee@zttc.edu”, “ff@itat.org”};
// private String[] contents = { “welcome to visited the space,I like book”, “hello boy, I like pingpeng ball”, “my name is cc I like game”, “I like football”,
// “I like football and I like basketball too”, “I like movie and swim” };
private String[] contents = { “ 创建一个内存目录对象，所以这里生成的索引会放在磁盘中，而不是在内存中 ”, “ 创建索引写入对象，该对象既可以把索引写入到磁盘中也可以写入到内存中 ”, “ 分词器，分词器就是将检索的关键字分割成一组组词组，它是 lucene 检索查询的一大特色之一 ”, “ 这个是分词器拆分最大长度，因为各种不同类型的分词器拆分的字符颗粒细化程度不一样，所以需要设置一个最长的拆分长度 ”,
“ 文档对象，在 lucene 中创建的索引可以看成数据库中的一张表，表中也可以有字段, 往里面添加内容之后可以根据字段去匹配查询 ”, “I like movie and swim” };
private String[] names = { “zhangsan”, “lisi”, “john”, “jetty”, “mike”, “jake”};
// 创建一个内存目录对象，所以这里生成的索引会放在磁盘中，而不是在内存中。
private Directory directory = null;
//IK 分词器
IKAnalyzer analyzer = null;
public searchIndex() {
try {
directory = FSDirectory.open(new File(“H:/lucene/index”));
analyzer = new IKAnalyzer(true);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

public void index() {
/*
* 创建索引写入对象，该对象既可以把索引写入到磁盘中也可以写入到内存中。
*/
IndexWriter writer;
try {
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, analyzer));
// 创建之前先删除
writer.deleteAll();
// 创建 Document
// 文档对象，在 lucene 中创建的索引可以看成数据库中的一张表，表中也可以有字段, 往里面添加内容之后可以根据字段去匹配查询

Document doc =null;

for(int i=0;i<ids.length;i++){
doc = new Document();
doc.add(new Field(“id”, ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
doc.add(new Field(“email”, emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(“content”, contents[i], Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field(“name”, names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
writer.addDocument(doc);
}
writer.close();
} catch (CorruptIndexException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (LockObtainFailedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

public List<User> search(String keyword) {
long startTime = System.currentTimeMillis();
System.out.println(“***************** 检索开始 **********************”);
List<User> userList = new ArrayList<User>();
IndexReader reader;
try {
reader = IndexReader.open(directory);

// 创建 IndexSearcher 检索索引的对象，里面要传递上面写入的内存目录对象 directory
IndexSearcher searcher = new IndexSearcher(reader);
// 根据搜索关键字封装一个 term 组合对象，然后封装成 Query 查询对象

QueryParser queryParser = new QueryParser(Version.LUCENE_36, “content”, analyzer);
Query query = queryParser.parse(keyword);

// 去索引目录中查询，返回的是 TopDocs 对象，里面存放的就是上面放的 document 文档对象
TopDocs rs = searcher.search(query, null, 10);
long endTime = System.currentTimeMillis();
System.out.println(“ 总共花费 ” + (endTime – startTime) + “ 毫秒，检索到 ” + rs.totalHits + “ 条记录。”);
User user = null;
for (int i = 0; i < rs.scoreDocs.length; i++) {
// rs.scoreDocs[i].doc 是获取索引中的标志位 id, 从 0 开始记录
Document firstHit = searcher.doc(rs.scoreDocs[i].doc);
user = new User();
user.setId(Long.parseLong(firstHit.get(“id”)));
user.setName(firstHit.get(“name”));
user.setSex(firstHit.get(“sex”));
user.setDosomething(firstHit.get(“dosometing”));
user.setEmail(firstHit.get(“email”));
user.setContent(firstHit.get(“content”));
userList.add(user);

// System.out.println(“name:” + firstHit.get(“name”));
// System.out.println(“sex:” + firstHit.get(“sex”));
// System.out.println(“dosomething:” + firstHit.get(“dosometing”));
}
reader.close();
} catch (CorruptIndexException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

System.out.println(“***************** 检索结束 **********************”);
return userList;
}

}

更多详情请继续阅读下一页的精彩内容：http://www.linuxidc.com/Linux/2014-06/103525p2.htm

基于 Lucene 多索引进行索引和搜索 http://www.linuxidc.com/Linux/2012-05/59757.htm

Lucene 实战 (第 2 版) 中文版配套源代码 http://www.linuxidc.com/Linux/2013-10/91055.htm

Lucene 实战 (第 2 版) PDF 高清中文版 http://www.linuxidc.com/Linux/2013-10/91052.htm

使用 Lucene-Spatial 实现集成地理位置的全文检索 http://www.linuxidc.com/Linux/2012-02/53117.htm

Lucene + Hadoop 分布式搜索运行框架 Nut 1.0a9 http://www.linuxidc.com/Linux/2012-02/53113.htm

Lucene + Hadoop 分布式搜索运行框架 Nut 1.0a8 http://www.linuxidc.com/Linux/2012-02/53111.htm

Lucene + Hadoop 分布式搜索运行框架 Nut 1.0a7 http://www.linuxidc.com/Linux/2012-02/53110.htm

Project 2-1: 配置 Lucene, 建立 WEB 查询系统 [Ubuntu 10.10] http://www.linuxidc.com/Linux/2010-11/30103.htm

package com.lucene;

import java.io.IOException;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.lucene.index.searchIndex;
import com.lucene.vo.User;

/**
* Servlet implementation class searchServlet
*/
public class searchServlet extends HttpServlet {
private static final long serialVersionUID = 1L;

/**
* Default constructor.
*/
public searchServlet() {
// TODO Auto-generated constructor stub
}

/**
* @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
*/
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
}

/**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
request.setCharacterEncoding(“UTF-8”);
String keyword = request.getParameter(“keyword”);
if(“”.equals(keyword)){
keyword=”0″;
}
searchIndex si = new searchIndex();
si.index();
List<User> userList = si.search(keyword);
request.setAttribute(“userList”, userList);
request.getRequestDispatcher(“search.jsp”).forward(request, response);
}

}

package com.lucene.vo;

/**
 * Plain value object carrying one search result. Every property is a simple
 * read/write bean property and is {@code null} until it has been set.
 */
public class User {

    private Long id;
    private String name;
    private String sex;
    private String dosomething;
    private String email;
    private String content;

    // ---- id ----

    /** @return the id, or {@code null} if not set */
    public Long getId() {
        return this.id;
    }

    /** @param id the id to store */
    public void setId(Long id) {
        this.id = id;
    }

    // ---- name ----

    /** @return the name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the name to store */
    public void setName(String name) {
        this.name = name;
    }

    // ---- sex ----

    /** @return the sex value, or {@code null} if not set */
    public String getSex() {
        return this.sex;
    }

    /** @param sex the sex value to store */
    public void setSex(String sex) {
        this.sex = sex;
    }

    // ---- dosomething ----

    /** @return the dosomething value, or {@code null} if not set */
    public String getDosomething() {
        return this.dosomething;
    }

    /** @param dosomething the dosomething value to store */
    public void setDosomething(String dosomething) {
        this.dosomething = dosomething;
    }

    // ---- email ----

    /** @return the e-mail address, or {@code null} if not set */
    public String getEmail() {
        return this.email;
    }

    /** @param email the e-mail address to store */
    public void setEmail(String email) {
        this.email = email;
    }

    // ---- content ----

    /** @return the content text, or {@code null} if not set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content text to store */
    public void setContent(String content) {
        this.content = content;
    }

}

<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%-- Search page: posts the keyword to searchServlet.do and, when the request
     attribute "userList" is not empty, renders one table row per user. --%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<%-- Must agree with the page directive above; the page is written and served as UTF-8. --%>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>lucene 全文检索 </title>
</head>
<body style="text-align: center;">
    <form action="searchServlet.do" method="post">
        <input type="text" name="keyword" /> <input type="submit" value=" 搜索 " />
    </form>
    <div style="height: 10px">
    </div>
    <c:if test="${not empty userList}">
        <div> 相关信息：</div>
        <table border="1" align="center">
            <tr>
                <td>ID</td>
                <td> 姓名 </td>
                <td> 性别 </td>
                <td> 邮箱 </td>
                <td> 爱好 </td>
                <td> 正文 </td>
            </tr>
            <c:forEach items="${userList}" var="user">
                <tr>
                    <td>${user.id}</td>
                    <td>${user.name}</td>
                    <td>${user.sex}</td>
                    <td>${user.email}</td>
                    <td>${user.dosomething}</td>
                    <td>${user.content}</td>
                </tr>
            </c:forEach>
        </table>
    </c:if>
</body>
</html>

代码测试：

Lucene 创建索引入门案例

Lucene 的详细介绍 ：请点这里
Lucene 的下载地址 ：请点这里

最近在学习 Lucene，参考网上的资料写了一个简单的搜索 demo。

项目 jar 包：

Lucene 创建索引入门案例

// 索引关键类

package com.lucene.index;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lucene.vo.User;

/**
*  * lucene 检索内存索引非常简单的例子  *  * @author Administrator  *  
*/
public class searchIndex {
private String[] ids = { “1”, “2”, “3”, “4”, “5”, “6”};
private String[] emails = { “aa@itat.org”, “bb@itat.org”, “cc@cc.org”, “dd@sina.org”, “ee@zttc.edu”, “ff@itat.org”};
// private String[] contents = { “welcome to visited the space,I like book”, “hello boy, I like pingpeng ball”, “my name is cc I like game”, “I like football”,
// “I like football and I like basketball too”, “I like movie and swim” };
private String[] contents = { “ 创建一个内存目录对象，所以这里生成的索引会放在磁盘中，而不是在内存中 ”, “ 创建索引写入对象，该对象既可以把索引写入到磁盘中也可以写入到内存中 ”, “ 分词器，分词器就是将检索的关键字分割成一组组词组，它是 lucene 检索查询的一大特色之一 ”, “ 这个是分词器拆分最大长度，因为各种不同类型的分词器拆分的字符颗粒细化程度不一样，所以需要设置一个最长的拆分长度 ”,
“ 文档对象，在 lucene 中创建的索引可以看成数据库中的一张表，表中也可以有字段, 往里面添加内容之后可以根据字段去匹配查询 ”, “I like movie and swim” };
private String[] names = { “zhangsan”, “lisi”, “john”, “jetty”, “mike”, “jake”};
// 创建一个内存目录对象，所以这里生成的索引会放在磁盘中，而不是在内存中。
private Directory directory = null;
//IK 分词器
IKAnalyzer analyzer = null;
public searchIndex() {
try {
directory = FSDirectory.open(new File(“H:/lucene/index”));
analyzer = new IKAnalyzer(true);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

public void index() {
/*
* 创建索引写入对象，该对象既可以把索引写入到磁盘中也可以写入到内存中。
*/
IndexWriter writer;
try {
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, analyzer));
// 创建之前先删除
writer.deleteAll();
// 创建 Document
// 文档对象，在 lucene 中创建的索引可以看成数据库中的一张表，表中也可以有字段, 往里面添加内容之后可以根据字段去匹配查询

Document doc =null;

for(int i=0;i<ids.length;i++){
doc = new Document();
doc.add(new Field(“id”, ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
doc.add(new Field(“email”, emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(“content”, contents[i], Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field(“name”, names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
writer.addDocument(doc);
}
writer.close();
} catch (CorruptIndexException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (LockObtainFailedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

public List<User> search(String keyword) {
long startTime = System.currentTimeMillis();
System.out.println(“***************** 检索开始 **********************”);
List<User> userList = new ArrayList<User>();
IndexReader reader;
try {
reader = IndexReader.open(directory);

// 创建 IndexSearcher 检索索引的对象，里面要传递上面写入的内存目录对象 directory
IndexSearcher searcher = new IndexSearcher(reader);
// 根据搜索关键字封装一个 term 组合对象，然后封装成 Query 查询对象

QueryParser queryParser = new QueryParser(Version.LUCENE_36, “content”, analyzer);
Query query = queryParser.parse(keyword);

// 去索引目录中查询，返回的是 TopDocs 对象，里面存放的就是上面放的 document 文档对象
TopDocs rs = searcher.search(query, null, 10);
long endTime = System.currentTimeMillis();
System.out.println(“ 总共花费 ” + (endTime – startTime) + “ 毫秒，检索到 ” + rs.totalHits + “ 条记录。”);
User user = null;
for (int i = 0; i < rs.scoreDocs.length; i++) {
// rs.scoreDocs[i].doc 是获取索引中的标志位 id, 从 0 开始记录
Document firstHit = searcher.doc(rs.scoreDocs[i].doc);
user = new User();
user.setId(Long.parseLong(firstHit.get(“id”)));
user.setName(firstHit.get(“name”));
user.setSex(firstHit.get(“sex”));
user.setDosomething(firstHit.get(“dosometing”));
user.setEmail(firstHit.get(“email”));
user.setContent(firstHit.get(“content”));
userList.add(user);

// System.out.println(“name:” + firstHit.get(“name”));
// System.out.println(“sex:” + firstHit.get(“sex”));
// System.out.println(“dosomething:” + firstHit.get(“dosometing”));
}
reader.close();
} catch (CorruptIndexException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

System.out.println(“***************** 检索结束 **********************”);
return userList;
}

}