• R/O
  • SSH
  • HTTPS

fess: Commit


Commit MetaInfo

Revision: 1606 (tree)
Time: 2013-03-03 17:19:00
Author: yfujita

Log Message

fixed #30879

Change Summary

Incremental Difference

--- fess-solr-plugin/trunk/src/main/java/jp/sf/fess/solr/plugin/analysis/SuggestTitleTokenizerFactory.java (nonexistent)
+++ fess-solr-plugin/trunk/src/main/java/jp/sf/fess/solr/plugin/analysis/SuggestTitleTokenizerFactory.java (revision 1606)
@@ -0,0 +1,126 @@
1+/*
2+ * Copyright 2009-2013 the Fess Project and the Others.
3+ *
4+ * Licensed under the Apache License, Version 2.0 (the "License");
5+ * you may not use this file except in compliance with the License.
6+ * You may obtain a copy of the License at
7+ *
8+ * http://www.apache.org/licenses/LICENSE-2.0
9+ *
10+ * Unless required by applicable law or agreed to in writing, software
11+ * distributed under the License is distributed on an "AS IS" BASIS,
12+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13+ * either express or implied. See the License for the specific language
14+ * governing permissions and limitations under the License.
15+ */
16+
17+package jp.sf.fess.solr.plugin.analysis;
18+
19+import java.io.InputStream;
20+import java.io.InputStreamReader;
21+import java.io.Reader;
22+import java.nio.charset.Charset;
23+import java.nio.charset.CharsetDecoder;
24+import java.nio.charset.CodingErrorAction;
25+import java.util.List;
26+import java.util.Locale;
27+import java.util.Map;
28+
29+import jp.sf.fess.solr.plugin.suggest.SuggestConverterCreator;
30+import jp.sf.fess.suggest.converter.SuggestConverter;
31+
32+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
33+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
34+import org.apache.lucene.analysis.ja.dict.UserDictionary;
35+import org.apache.lucene.analysis.util.ResourceLoader;
36+import org.apache.lucene.analysis.util.ResourceLoaderAware;
37+import org.apache.lucene.analysis.util.TokenizerFactory;
38+import org.apache.lucene.util.IOUtils;
39+import org.slf4j.Logger;
40+import org.slf4j.LoggerFactory;
41+
42+public class SuggestTitleTokenizerFactory extends TokenizerFactory implements
43+ ResourceLoaderAware {
44+
45+ private static final Logger logger = LoggerFactory
46+ .getLogger(SuggestTitleTokenizerFactory.class);
47+
48+ private static final String MODE = "mode";
49+
50+ private static final String USER_DICT_PATH = "userDictionary";
51+
52+ private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
53+
54+ private static final String BUFFER_SIZE = "bufferSize";
55+
56+ private static final String WORD_SEPARATOR = "wordSeparator";
57+
58+ private UserDictionary userDictionary;
59+
60+ private Mode mode;
61+
62+ private boolean discardPunctuation;
63+
64+ private int bufferSize;
65+
66+ private String wordSeparator;
67+
68+ private List<SuggestConverter> preConverterList;
69+
70+ private List<SuggestConverter> converterList;
71+
72+ @Override
73+ public SuggestTitleTokenizer create(final Reader input) {
74+ return new SuggestTitleTokenizer(input, bufferSize, userDictionary,
75+ discardPunctuation, mode, preConverterList, converterList,
76+ wordSeparator);
77+ }
78+
79+ @Override
80+ public void inform(final ResourceLoader loader) {
81+ try {
82+ mode = getMode(args);
83+ final String userDictionaryPath = args.get(USER_DICT_PATH);
84+ if (userDictionaryPath != null) {
85+ final InputStream stream = loader
86+ .openResource(userDictionaryPath);
87+ String encoding = args.get(USER_DICT_ENCODING);
88+ if (encoding == null) {
89+ encoding = IOUtils.UTF_8;
90+ }
91+ final CharsetDecoder decoder = Charset.forName(encoding)
92+ .newDecoder()
93+ .onMalformedInput(CodingErrorAction.REPORT)
94+ .onUnmappableCharacter(CodingErrorAction.REPORT);
95+ final Reader reader = new InputStreamReader(stream, decoder);
96+ userDictionary = new UserDictionary(reader);
97+ } else {
98+ userDictionary = null;
99+ }
100+ discardPunctuation = true;
101+
102+ bufferSize = getInt(BUFFER_SIZE, 256);
103+ wordSeparator = args.get(WORD_SEPARATOR);
104+ if (wordSeparator == null) {
105+ wordSeparator = "_SP_";
106+ }
107+
108+ preConverterList = SuggestConverterCreator.create(args
109+ .get("preConverters"));
110+ converterList = SuggestConverterCreator.create(args
111+ .get("converters"));
112+ } catch (final Exception e) {
113+ logger.warn("Initialization failed.", e);
114+ }
115+ }
116+
117+ private Mode getMode(final Map<String, String> args) {
118+ final String mode = args.get(MODE);
119+ if (mode != null) {
120+ return Mode.valueOf(mode.toUpperCase(Locale.ROOT));
121+ } else {
122+ return JapaneseTokenizer.Mode.NORMAL;
123+ }
124+ }
125+
126+}
--- fess-solr-plugin/trunk/src/main/java/jp/sf/fess/solr/plugin/analysis/SuggestTitleTokenizer.java (nonexistent)
+++ fess-solr-plugin/trunk/src/main/java/jp/sf/fess/solr/plugin/analysis/SuggestTitleTokenizer.java (revision 1606)
@@ -0,0 +1,223 @@
1+/*
2+ * Copyright 2009-2013 the Fess Project and the Others.
3+ *
4+ * Licensed under the Apache License, Version 2.0 (the "License");
5+ * you may not use this file except in compliance with the License.
6+ * You may obtain a copy of the License at
7+ *
8+ * http://www.apache.org/licenses/LICENSE-2.0
9+ *
10+ * Unless required by applicable law or agreed to in writing, software
11+ * distributed under the License is distributed on an "AS IS" BASIS,
12+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13+ * either express or implied. See the License for the specific language
14+ * governing permissions and limitations under the License.
15+ */
16+
17+package jp.sf.fess.solr.plugin.analysis;
18+
19+import java.io.IOException;
20+import java.io.Reader;
21+import java.io.StringReader;
22+import java.util.ArrayList;
23+import java.util.List;
24+
25+import jp.sf.fess.suggest.converter.SuggestConverter;
26+
27+import org.apache.commons.io.IOUtils;
28+import org.apache.lucene.analysis.TokenStream;
29+import org.apache.lucene.analysis.Tokenizer;
30+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
31+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
32+import org.apache.lucene.analysis.ja.dict.UserDictionary;
33+import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
34+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
35+import org.slf4j.Logger;
36+import org.slf4j.LoggerFactory;
37+
38+import com.ibm.icu.text.Transliterator;
39+
40+public class SuggestTitleTokenizer extends Tokenizer {
41+ private static final Logger logger = LoggerFactory
42+ .getLogger(SuggestTitleTokenizer.class);
43+
44+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
45+
46+ private String inputStr = "";
47+
48+ private int offset = 0;
49+
50+ private final List<String> termListByKuromoji = new ArrayList<String>();
51+
52+ private final List<String> readingList = new ArrayList<String>();
53+
54+ private String[] titleArray = null;
55+
56+ private final UserDictionary userDictionary;
57+
58+ private final boolean discardPunctuation;
59+
60+ private final Mode tokenizerMode;
61+
62+ private final String wordSeparator;
63+
64+ private final List<SuggestConverter> preConverterList;
65+
66+ private final List<SuggestConverter> converterList;
67+
68+ public SuggestTitleTokenizer(final Reader input, final int bufferSize,
69+ final UserDictionary userDictionaryPara,
70+ final boolean discardPunctuationPara, final Mode modePara,
71+ final List<SuggestConverter> preconverterList,
72+ final List<SuggestConverter> converterList,
73+ final String wordSeparator) {
74+ super(input);
75+
76+ userDictionary = userDictionaryPara;
77+ discardPunctuation = discardPunctuationPara;
78+ tokenizerMode = modePara;
79+ termAtt.resizeBuffer(bufferSize);
80+ this.wordSeparator = wordSeparator;
81+ preConverterList = preconverterList;
82+ this.converterList = converterList;
83+
84+ initialize();
85+ }
86+
87+ public void initialize() {
88+ termListByKuromoji.clear();
89+ readingList.clear();
90+ titleArray = null;
91+ offset = 0;
92+ inputStr = "";
93+
94+ try {
95+ final String s = IOUtils.toString(input);
96+ if (s != null && s.length() > 0) {
97+ inputStr = s;
98+ for (final SuggestConverter converter : preConverterList) {
99+ inputStr = converter.convert(inputStr);
100+ }
101+ titleArray = inputStr.split("\\$\\{and\\}");
102+ inputStr = inputStr.replace("${and}", " ");
103+ }
104+ } catch (final IOException e) {
105+ }
106+
107+ final Reader rd = new StringReader(inputStr);
108+
109+ TokenStream stream = null;
110+
111+ try {
112+ stream = new JapaneseTokenizer(rd, userDictionary,
113+ discardPunctuation, tokenizerMode);
114+
115+ stream.reset();
116+ while (stream.incrementToken()) {
117+ final CharTermAttribute att = stream
118+ .getAttribute(CharTermAttribute.class);
119+ termListByKuromoji.add(att.toString());
120+
121+ final ReadingAttribute rdAttr = stream
122+ .getAttribute(ReadingAttribute.class);
123+
124+ String reading;
125+ if (rdAttr.getReading() != null) {
126+ reading = rdAttr.getReading();
127+ } else {
128+ reading = att.toString();
129+ }
130+
131+ for (final SuggestConverter converter : converterList) {
132+ reading = converter.convert(reading);
133+ }
134+ readingList.add(reading);
135+
136+ }
137+
138+ } catch (final Exception e) {
139+ logger.warn("JapaneseTokenizer stream error", e);
140+ } finally {
141+ try {
142+ input.reset();
143+ } catch (final Exception e) {
144+ }
145+ try {
146+ stream.end();
147+ } catch (final Exception e) {
148+ }
149+ try {
150+ rd.close();
151+ } catch (final Exception e) {
152+ }
153+ }
154+ }
155+
156+ @Override
157+ public boolean incrementToken() throws IOException {
158+ if (titleArray == null || offset >= titleArray.length) {
159+ return false;
160+ }
161+
162+ termAtt.setEmpty();
163+ termAtt.append(convertSuggestString(titleArray[offset],
164+ getReading(titleArray[offset])));
165+ offset++;
166+ return true;
167+ }
168+
169+ @Override
170+ public void reset() throws IOException {
171+ super.reset();
172+ initialize();
173+ }
174+
175+ private String convertSuggestString(final String term, final String reading) {
176+ String suggestString;
177+ if (reading != null && reading.length() > 0) {
178+ suggestString = reading + wordSeparator + term;
179+ } else {
180+ suggestString = term;
181+ }
182+
183+ return suggestString;
184+ }
185+
186+ private String getReading(final String s) {
187+
188+ final StringBuilder buf = new StringBuilder();
189+
190+ for (int i = 0; i < s.length(); i++) {
191+ String term = "";
192+ int length = 0;
193+
194+ for (int j = 0; j < termListByKuromoji.size(); j++) {
195+ final String tmpStr = termListByKuromoji.get(j);
196+ if (s.substring(i).indexOf(tmpStr) == 0) {
197+ if (tmpStr.length() > term.length()) {
198+ term = readingList.get(j);
199+ length = tmpStr.length();
200+ }
201+ }
202+ }
203+ if (term.length() > 0) {
204+ buf.append(term);
205+ i += length - 1;
206+ } else {
207+ char c = s.charAt(i);
208+
209+ c = Transliterator.getInstance("Hiragana-Katakana")
210+ .transliterate(String.valueOf(c)).charAt(0);
211+
212+ buf.append(c);
213+ }
214+ }
215+
216+ String reading = buf.toString();
217+ for (final SuggestConverter converter : converterList) {
218+ reading = converter.convert(reading);
219+ }
220+
221+ return reading;
222+ }
223+}
Show on old repository browser