-
Notifications
You must be signed in to change notification settings - Fork 57
Lucene&Solr索引模块
在dcd.academic.index这个package下有ResearcherIndex,PublicationIndex和PaperIndex三个建立索引的程序,分别对应的是学者元数据,论文元数据和论文全文的索引建立,并且三份索引分别建立在solr的multicore配置下的三个core内,以提供不同的搜索服务。以ResearcherIndex.java为例:
package dcd.academic.index;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import dcd.academic.model.ResearcherModel;
import dcd.academic.util.StdOutUtil;
import com.mongodb.BasicDBList;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
/**
 * Builds the Lucene index of researcher metadata for the solr core0 search
 * service. Records are read from the MongoDB collection
 * {@code academic.researchers} and written to a filesystem index directory.
 */
public class ResearcherIndex {
    private final String indexPath;
    private final IKAnalyzer analyzer = new IKAnalyzer(true);

    /**
     * @param indexPath filesystem path of the Lucene index directory;
     *                  should match solr core0's {@code data/index} directory
     */
    public ResearcherIndex(String indexPath) {
        this.indexPath = indexPath;
    }

    /**
     * Reads every researcher record from MongoDB and adds it to the index.
     * The index is opened in CREATE_OR_APPEND mode, so repeated runs append.
     *
     * @throws IOException if the index directory cannot be opened or written
     */
    public void build() throws IOException {
        FSDirectory directory = FSDirectory.open(new File(indexPath));
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(directory, conf);
        // data from mongodb
        MongoClient mongoClient = new MongoClient("localhost", 30000);
        try {
            DB db = mongoClient.getDB("academic");
            DBCollection coll = db.getCollection("researchers");
            DBCursor cursor = coll.find();
            try {
                while (cursor.hasNext()) {
                    writer.addDocument(toDocument(cursor.next()));
                }
            } finally {
                cursor.close();
            }
        } finally {
            // BUG FIX: writer was previously closed outside any finally (leaked
            // on exception) and the MongoClient was never closed at all.
            writer.close();
            mongoClient.close();
        }
    }

    /** Maps one MongoDB researcher record to a Lucene document. */
    private Document toDocument(DBObject obj) {
        ResearcherModel model = new ResearcherModel();
        model.setName((String) obj.get("name"));
        model.setWorkplace((String) obj.get("workplace"));
        model.setHomepage((String) obj.get("homepage"));
        // Optional fields default to "" so the Field constructors never see null.
        model.setPicurl(stringOrEmpty(obj.get("picurl")));
        model.setMoretags(stringOrEmpty(obj.get("moretags")));
        model.setField(joinFields((BasicDBList) obj.get("field")));
        StdOutUtil.out(model.getField());

        Document doc = new Document();
        doc.add(new Field("name", model.getName(), Store.YES, Index.ANALYZED));
        doc.add(new Field("workplace", model.getWorkplace(), Store.YES, Index.ANALYZED));
        doc.add(new Field("homepage", model.getHomepage(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("field", model.getField(), Store.YES, Index.ANALYZED));
        doc.add(new Field("picurl", model.getPicurl(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("moretags", model.getMoretags(), Store.YES, Index.NOT_ANALYZED));
        return doc;
    }

    /** Returns the raw value as a String, or "" when the field is absent. */
    private static String stringOrEmpty(Object value) {
        return value != null ? (String) value : "";
    }

    /**
     * Joins the entries of the "field" array into a comma-separated string,
     * skipping entries of length &lt;= 1 and stripping '&amp;' characters.
     * Returns "" when the list is null or empty.
     *
     * BUG FIX: the original compared strings with {@code s == ""} (reference
     * identity), cast list elements straight to String, and NPE'd when the
     * record had no "field" array. StringBuilder also avoids O(n^2) concat.
     */
    private static String joinFields(BasicDBList list) {
        StringBuilder sb = new StringBuilder();
        if (list != null) {
            for (Object item : list) {
                String tmp = item.toString();
                if (tmp.length() > 1) {
                    tmp = tmp.replace("&", "");
                    if (sb.length() > 0) {
                        sb.append(", ");
                    }
                    sb.append(tmp);
                }
            }
        }
        return sb.toString();
    }

    /**
     * Entry point. An alternative index path may be supplied as the first
     * command-line argument; the original hard-coded path remains the default,
     * so existing invocations are unaffected.
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0
                ? args[0]
                : "E://softs2/apache-solr-3.6.2/example/multicore/core0/data/index";
        new ResearcherIndex(path).build();
    }
}
代码很简单,就是从mongodb的researchers这个collection里读取学者元数据,而
doc.add(new Field("name", model.getName(), Store.YES, Index.ANALYZED));
doc.add(new Field("workplace", model.getWorkplace(), Store.YES, Index.ANALYZED));
doc.add(new Field("homepage", model.getHomepage(), Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("field", model.getField(), Store.YES, Index.ANALYZED));
doc.add(new Field("picurl", model.getPicurl(), Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("moretags", model.getMoretags(), Store.YES, Index.NOT_ANALYZED));
显示的是字段的索引情况。这样的索引块单独是可以让lucene提供搜索的。但如果要使用solr来做这个搜索服务的话,需要在solr的schema.xml里对应设置好这几个字段的统一配置:
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="workplace" type="text_ik" indexed="true" stored="true" />
<field name="homepage" type="text_ik" indexed="false" stored="true" />
<field name="field" type="text_ik" indexed="true" stored="true" />
<field name="picurl" type="text_ik" indexed="false" stored="true" />
<field name="moretags" type="text_ik" indexed="false" stored="true" />
我项目中的lucene部分就完全是这三个.java的工作量,如果不使用solr,那么就可以直接拿来做搜索了。如果使用solr的话,请看下一小节具体我对solr的配置。
我使用的solr版本是3.6.2,4.0以上的solr支持solr cloud。Solr Server单独要一个tomcat启动,我开启的是9080端口,和web项目在同一台机器上,web的tomcat还是8080端口。Solr使用的是multicore配置,core0提供学者元数据搜索,core1提供论文元数据搜索,core2提供论文全文搜索。三者有各自单独的配置,不过大同小异,主要都是在solrconfig.xml和schema.xml两个主要配置文件里,区别就在于三个core里面索引字段的配置区别(配置在schema.xml内)。
相同配置在于在schema.xml里都配置了使用第三方IKAnalyzer分词器。要加上以下配置:
<fieldType name="text_ik" class="solr.TextField">
<analyzer type="index">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="false" useSmart="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
在定义字段的时候,只要在type处使用text_ik就可以使用IK切词了,当然IK的jar包要放在tomcat webapps的solr项目的web lib里面。
在schema.xml里,每个core的索引字段配置不同
学者元数据core0:
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="workplace" type="text_ik" indexed="true" stored="true" />
<field name="homepage" type="text_ik" indexed="false" stored="true" />
<field name="field" type="text_ik" indexed="true" stored="true" />
<field name="picurl" type="text_ik" indexed="false" stored="true" />
<field name="moretags" type="text_ik" indexed="false" stored="true" />
论文元数据core1:
<field name="title" type="text_ik" indexed="true" stored="true" />
<field name="pub_abstract" type="text_ik" indexed="true" stored="true" />
<field name="conference" type="text_ik" indexed="true" stored="true" />
<field name="view_url" type="text_ik" indexed="false" stored="true" />
<field name="author" type="text_ik" indexed="true" stored="true" />
论文全文core2:
<field name="title" type="text_ik" indexed="true" stored="true" />
<field name="name" type="text_ik" indexed="true" stored="true" />
<field name="text" type="text_ik" indexed="true" stored="false" />
<field name="url" type="text_ik" indexed="false" stored="true" />
把solr本身的war丢到tomcat的webapps下之后,需要在tomcat的conf/Catalina/localhost下添加一个solr.xml配置文件:
<Context docBase="E:\softs2\apache-tomcat-6.0.36\webapps\solr.war" debug="0" crossContext="true" >
<Environment name="solr/home" type="java.lang.String" value="E:\softs2\apache-solr-3.6.2\example\multicore" override="true" />
</Context>
作用是指明webapps下的war包对应的本地的solr home的位置,因为真正solr搜索相关的配置都在multicore里面。
如果想配置solrconfig.xml的话,主要配置/select这个requestHandler里的一些设置就可以了,比如:
<requestHandler name="/select" class="solr.SearchHandler">
<lst name="defaults">
<str name="echoParams">explicit</str>
<int name="rows">10</int>
<str name="df">name</str>
<bool name="hl">true</bool>
<str name="hl.fl">name</str>
<int name="hl.snippets">3</int>
</lst>
</requestHandler>
配置的是返回行数,df是默认搜索域,hl是高亮,hl.fl指name是默认高亮域。