-
Notifications
You must be signed in to change notification settings - Fork 57
Lucene&Solr索引模块
在dcd.academic.index这个package下有ResearcherIndex,PublicationIndex和PaperIndex三个建立索引的程序,分别对应的是学者元数据,论文元数据和论文全文的索引建立,并且三份索引分别建立在solr的multicore配置下的三个core内,以提供不同的搜索服务。以ResearcherIndex.java为例:
package dcd.academic.index;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import dcd.academic.model.ResearcherModel;
import dcd.academic.util.StdOutUtil;
import com.mongodb.BasicDBList;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
/**
 * Builds the Lucene index of researcher metadata for the solr core0 search
 * service. Records are read from the MongoDB collection
 * {@code academic.researchers} and written to a filesystem index directory.
 */
public class ResearcherIndex {
    private final String indexPath;
    private final IKAnalyzer analyzer = new IKAnalyzer(true);

    /**
     * @param indexPath filesystem path of the Lucene index directory;
     *                  should match solr core0's {@code data/index} directory
     */
    public ResearcherIndex(String indexPath) {
        this.indexPath = indexPath;
    }

    /**
     * Reads every researcher record from MongoDB and adds it to the index.
     * The index is opened in CREATE_OR_APPEND mode, so repeated runs append.
     *
     * @throws IOException if the index directory cannot be opened or written
     */
    public void build() throws IOException {
        FSDirectory directory = FSDirectory.open(new File(indexPath));
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(directory, conf);
        // data from mongodb
        MongoClient mongoClient = new MongoClient("localhost", 30000);
        try {
            DB db = mongoClient.getDB("academic");
            DBCollection coll = db.getCollection("researchers");
            DBCursor cursor = coll.find();
            try {
                while (cursor.hasNext()) {
                    writer.addDocument(toDocument(cursor.next()));
                }
            } finally {
                cursor.close();
            }
        } finally {
            // BUG FIX: writer was previously closed outside any finally (leaked
            // on exception) and the MongoClient was never closed at all.
            writer.close();
            mongoClient.close();
        }
    }

    /** Maps one MongoDB researcher record to a Lucene document. */
    private Document toDocument(DBObject obj) {
        ResearcherModel model = new ResearcherModel();
        model.setName((String) obj.get("name"));
        model.setWorkplace((String) obj.get("workplace"));
        model.setHomepage((String) obj.get("homepage"));
        // Optional fields default to "" so the Field constructors never see null.
        model.setPicurl(stringOrEmpty(obj.get("picurl")));
        model.setMoretags(stringOrEmpty(obj.get("moretags")));
        model.setField(joinFields((BasicDBList) obj.get("field")));
        StdOutUtil.out(model.getField());

        Document doc = new Document();
        doc.add(new Field("name", model.getName(), Store.YES, Index.ANALYZED));
        doc.add(new Field("workplace", model.getWorkplace(), Store.YES, Index.ANALYZED));
        doc.add(new Field("homepage", model.getHomepage(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("field", model.getField(), Store.YES, Index.ANALYZED));
        doc.add(new Field("picurl", model.getPicurl(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("moretags", model.getMoretags(), Store.YES, Index.NOT_ANALYZED));
        return doc;
    }

    /** Returns the raw value as a String, or "" when the field is absent. */
    private static String stringOrEmpty(Object value) {
        return value != null ? (String) value : "";
    }

    /**
     * Joins the entries of the "field" array into a comma-separated string,
     * skipping entries of length &lt;= 1 and stripping '&amp;' characters.
     * Returns "" when the list is null or empty.
     *
     * BUG FIX: the original compared strings with {@code s == ""} (reference
     * identity), cast list elements straight to String, and NPE'd when the
     * record had no "field" array. StringBuilder also avoids O(n^2) concat.
     */
    private static String joinFields(BasicDBList list) {
        StringBuilder sb = new StringBuilder();
        if (list != null) {
            for (Object item : list) {
                String tmp = item.toString();
                if (tmp.length() > 1) {
                    tmp = tmp.replace("&", "");
                    if (sb.length() > 0) {
                        sb.append(", ");
                    }
                    sb.append(tmp);
                }
            }
        }
        return sb.toString();
    }

    /**
     * Entry point. An alternative index path may be supplied as the first
     * command-line argument; the original hard-coded path remains the default,
     * so existing invocations are unaffected.
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0
                ? args[0]
                : "E://softs2/apache-solr-3.6.2/example/multicore/core0/data/index";
        new ResearcherIndex(path).build();
    }
}
代码很简单,就是从mongodb的researchers这个collection里读取学者元数据,而
doc.add(new Field("name", model.getName(), Store.YES, Index.ANALYZED));
doc.add(new Field("workplace", model.getWorkplace(), Store.YES, Index.ANALYZED));
doc.add(new Field("homepage", model.getHomepage(), Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("field", model.getField(), Store.YES, Index.ANALYZED));
doc.add(new Field("picurl", model.getPicurl(), Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("moretags", model.getMoretags(), Store.YES, Index.NOT_ANALYZED));
显示的是字段的索引情况。这样的索引块单独是可以让lucene提供搜索的。但如果要使用solr来做这个搜索服务的话,需要在solr的schema.xml里对应设置好这几个字段的统一配置:
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="workplace" type="text_ik" indexed="true" stored="true" />
<field name="homepage" type="text_ik" indexed="false" stored="true" />
<field name="field" type="text_ik" indexed="true" stored="true" />
<field name="picurl" type="text_ik" indexed="false" stored="true" />
<field name="moretags" type="text_ik" indexed="false" stored="true" />
我项目中的lucene部分就完全是这三个.java的工作量,如果不使用solr,那么就可以直接拿来做搜索了。如果使用solr的话,请看下一小节具体我对solr的配置。
我使用的solr版本是3.6.2,4.0以上的solr支持solr cloud。Solr Server单独要一个tomcat启动,我开启的是9080端口,和web项目在同一台机器上,web的tomcat还是8080端口。Solr使用的是multicore配置,core0提供学者元数据搜索,core1提供论文元数据搜索,core2提供论文全文搜索。三者有各自单独的配置,不过大同小异,主要都是在solrconfig.xml和schema.xml两个主要配置文件里,区别就在于三个core里面索引字段的配置区别(配置在schema.xml内)。
相同配置在于在schema.xml里都配置了使用第三方IKAnalyzer分词器。要加上以下配置:
<fieldType name="text_ik" class="solr.TextField">
<analyzer type="index">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="false" useSmart="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
在定义字段的时候,只要在type处使用text_ik就可以使用IK切词了,当然IK的jar包要放在tomcat webapps的solr项目的web lib里面。
在schema.xml里,每个core的索引字段配置不同
学者元数据core0:
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="workplace" type="text_ik" indexed="true" stored="true" />
<field name="homepage" type="text_ik" indexed="false" stored="true" />
<field name="field" type="text_ik" indexed="true" stored="true" />
<field name="picurl" type="text_ik" indexed="false" stored="true" />
<field name="moretags" type="text_ik" indexed="false" stored="true" />
论文元数据core1:
<field name="title" type="text_ik" indexed="true" stored="true" />
<field name="pub_abstract" type="text_ik" indexed="true" stored="true" />
<field name="conference" type="text_ik" indexed="true" stored="true" />
<field name="view_url" type="text_ik" indexed="false" stored="true" />
<field name="author" type="text_ik" indexed="true" stored="true" />
论文全文core2:
<field name="title" type="text_ik" indexed="true" stored="true" />
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="text" type="text_ik" indexed="true" stored="false" />
<field name="url" type="text_ik" indexed="false" stored="true" />
把solr本身的war丢到tomcat的webapps下之后,需要在tomcat的conf/Catalina/localhost下添加一个solr.xml配置文件:
<Context docBase="E:\softs2\apache-tomcat-6.0.36\webapps\solr.war" debug="0" crossContext="true" >
<Environment name="solr/home" type="java.lang.String" value="E:\softs2\apache-solr-3.6.2\example\multicore" override="true" />
</Context>
作用是指明webapps下的war包对应的本地的solr home的位置,因为真正solr搜索相关的配置都在multicore里面。
如果想配置solrconfig.xml的话,主要配置/select这个requestHandler里的一些设置就可以了,比如:
<requestHandler name="/select" class="solr.SearchHandler">
<lst name="defaults">
<str name="echoParams">explicit</str>
<int name="rows">10</int>
<str name="df">name</str>
<bool name="hl">true</bool>
<str name="hl.fl">name</str>
<int name="hl.snippets">3</int>
</lst>
</requestHandler>
配置的是返回行数,df是默认搜索域,hl是高亮,hl.fl指name是默认高亮域。