fixed #30879
@@ -0,0 +1,126 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.InputStream; | |
20 | +import java.io.InputStreamReader; | |
21 | +import java.io.Reader; | |
22 | +import java.nio.charset.Charset; | |
23 | +import java.nio.charset.CharsetDecoder; | |
24 | +import java.nio.charset.CodingErrorAction; | |
25 | +import java.util.List; | |
26 | +import java.util.Locale; | |
27 | +import java.util.Map; | |
28 | + | |
29 | +import jp.sf.fess.solr.plugin.suggest.SuggestConverterCreator; | |
30 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
31 | + | |
32 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
33 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
34 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
35 | +import org.apache.lucene.analysis.util.ResourceLoader; | |
36 | +import org.apache.lucene.analysis.util.ResourceLoaderAware; | |
37 | +import org.apache.lucene.analysis.util.TokenizerFactory; | |
38 | +import org.apache.lucene.util.IOUtils; | |
39 | +import org.slf4j.Logger; | |
40 | +import org.slf4j.LoggerFactory; | |
41 | + | |
42 | +public class SuggestTitleTokenizerFactory extends TokenizerFactory implements | |
43 | + ResourceLoaderAware { | |
44 | + | |
45 | + private static final Logger logger = LoggerFactory | |
46 | + .getLogger(SuggestTitleTokenizerFactory.class); | |
47 | + | |
48 | + private static final String MODE = "mode"; | |
49 | + | |
50 | + private static final String USER_DICT_PATH = "userDictionary"; | |
51 | + | |
52 | + private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; | |
53 | + | |
54 | + private static final String BUFFER_SIZE = "bufferSize"; | |
55 | + | |
56 | + private static final String WORD_SEPARATOR = "wordSeparator"; | |
57 | + | |
58 | + private UserDictionary userDictionary; | |
59 | + | |
60 | + private Mode mode; | |
61 | + | |
62 | + private boolean discardPunctuation; | |
63 | + | |
64 | + private int bufferSize; | |
65 | + | |
66 | + private String wordSeparator; | |
67 | + | |
68 | + private List<SuggestConverter> preConverterList; | |
69 | + | |
70 | + private List<SuggestConverter> converterList; | |
71 | + | |
72 | + @Override | |
73 | + public SuggestTitleTokenizer create(final Reader input) { | |
74 | + return new SuggestTitleTokenizer(input, bufferSize, userDictionary, | |
75 | + discardPunctuation, mode, preConverterList, converterList, | |
76 | + wordSeparator); | |
77 | + } | |
78 | + | |
79 | + @Override | |
80 | + public void inform(final ResourceLoader loader) { | |
81 | + try { | |
82 | + mode = getMode(args); | |
83 | + final String userDictionaryPath = args.get(USER_DICT_PATH); | |
84 | + if (userDictionaryPath != null) { | |
85 | + final InputStream stream = loader | |
86 | + .openResource(userDictionaryPath); | |
87 | + String encoding = args.get(USER_DICT_ENCODING); | |
88 | + if (encoding == null) { | |
89 | + encoding = IOUtils.UTF_8; | |
90 | + } | |
91 | + final CharsetDecoder decoder = Charset.forName(encoding) | |
92 | + .newDecoder() | |
93 | + .onMalformedInput(CodingErrorAction.REPORT) | |
94 | + .onUnmappableCharacter(CodingErrorAction.REPORT); | |
95 | + final Reader reader = new InputStreamReader(stream, decoder); | |
96 | + userDictionary = new UserDictionary(reader); | |
97 | + } else { | |
98 | + userDictionary = null; | |
99 | + } | |
100 | + discardPunctuation = true; | |
101 | + | |
102 | + bufferSize = getInt(BUFFER_SIZE, 256); | |
103 | + wordSeparator = args.get(WORD_SEPARATOR); | |
104 | + if (wordSeparator == null) { | |
105 | + wordSeparator = "_SP_"; | |
106 | + } | |
107 | + | |
108 | + preConverterList = SuggestConverterCreator.create(args | |
109 | + .get("preConverters")); | |
110 | + converterList = SuggestConverterCreator.create(args | |
111 | + .get("converters")); | |
112 | + } catch (final Exception e) { | |
113 | + logger.warn("Initialization failed.", e); | |
114 | + } | |
115 | + } | |
116 | + | |
117 | + private Mode getMode(final Map<String, String> args) { | |
118 | + final String mode = args.get(MODE); | |
119 | + if (mode != null) { | |
120 | + return Mode.valueOf(mode.toUpperCase(Locale.ROOT)); | |
121 | + } else { | |
122 | + return JapaneseTokenizer.Mode.NORMAL; | |
123 | + } | |
124 | + } | |
125 | + | |
126 | +} |
@@ -0,0 +1,223 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.IOException; | |
20 | +import java.io.Reader; | |
21 | +import java.io.StringReader; | |
22 | +import java.util.ArrayList; | |
23 | +import java.util.List; | |
24 | + | |
25 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
26 | + | |
27 | +import org.apache.commons.io.IOUtils; | |
28 | +import org.apache.lucene.analysis.TokenStream; | |
29 | +import org.apache.lucene.analysis.Tokenizer; | |
30 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
31 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
32 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
33 | +import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
34 | +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
35 | +import org.slf4j.Logger; | |
36 | +import org.slf4j.LoggerFactory; | |
37 | + | |
38 | +import com.ibm.icu.text.Transliterator; | |
39 | + | |
40 | +public class SuggestTitleTokenizer extends Tokenizer { | |
41 | + private static final Logger logger = LoggerFactory | |
42 | + .getLogger(SuggestTitleTokenizer.class); | |
43 | + | |
44 | + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | |
45 | + | |
46 | + private String inputStr = ""; | |
47 | + | |
48 | + private int offset = 0; | |
49 | + | |
50 | + private final List<String> termListByKuromoji = new ArrayList<String>(); | |
51 | + | |
52 | + private final List<String> readingList = new ArrayList<String>(); | |
53 | + | |
54 | + private String[] titleArray = null; | |
55 | + | |
56 | + private final UserDictionary userDictionary; | |
57 | + | |
58 | + private final boolean discardPunctuation; | |
59 | + | |
60 | + private final Mode tokenizerMode; | |
61 | + | |
62 | + private final String wordSeparator; | |
63 | + | |
64 | + private final List<SuggestConverter> preConverterList; | |
65 | + | |
66 | + private final List<SuggestConverter> converterList; | |
67 | + | |
68 | + public SuggestTitleTokenizer(final Reader input, final int bufferSize, | |
69 | + final UserDictionary userDictionaryPara, | |
70 | + final boolean discardPunctuationPara, final Mode modePara, | |
71 | + final List<SuggestConverter> preconverterList, | |
72 | + final List<SuggestConverter> converterList, | |
73 | + final String wordSeparator) { | |
74 | + super(input); | |
75 | + | |
76 | + userDictionary = userDictionaryPara; | |
77 | + discardPunctuation = discardPunctuationPara; | |
78 | + tokenizerMode = modePara; | |
79 | + termAtt.resizeBuffer(bufferSize); | |
80 | + this.wordSeparator = wordSeparator; | |
81 | + preConverterList = preconverterList; | |
82 | + this.converterList = converterList; | |
83 | + | |
84 | + initialize(); | |
85 | + } | |
86 | + | |
87 | + public void initialize() { | |
88 | + termListByKuromoji.clear(); | |
89 | + readingList.clear(); | |
90 | + titleArray = null; | |
91 | + offset = 0; | |
92 | + inputStr = ""; | |
93 | + | |
94 | + try { | |
95 | + final String s = IOUtils.toString(input); | |
96 | + if (s != null && s.length() > 0) { | |
97 | + inputStr = s; | |
98 | + for (final SuggestConverter converter : preConverterList) { | |
99 | + inputStr = converter.convert(inputStr); | |
100 | + } | |
101 | + titleArray = inputStr.split("\\$\\{and\\}"); | |
102 | + inputStr = inputStr.replace("${and}", " "); | |
103 | + } | |
104 | + } catch (final IOException e) { | |
105 | + } | |
106 | + | |
107 | + final Reader rd = new StringReader(inputStr); | |
108 | + | |
109 | + TokenStream stream = null; | |
110 | + | |
111 | + try { | |
112 | + stream = new JapaneseTokenizer(rd, userDictionary, | |
113 | + discardPunctuation, tokenizerMode); | |
114 | + | |
115 | + stream.reset(); | |
116 | + while (stream.incrementToken()) { | |
117 | + final CharTermAttribute att = stream | |
118 | + .getAttribute(CharTermAttribute.class); | |
119 | + termListByKuromoji.add(att.toString()); | |
120 | + | |
121 | + final ReadingAttribute rdAttr = stream | |
122 | + .getAttribute(ReadingAttribute.class); | |
123 | + | |
124 | + String reading; | |
125 | + if (rdAttr.getReading() != null) { | |
126 | + reading = rdAttr.getReading(); | |
127 | + } else { | |
128 | + reading = att.toString(); | |
129 | + } | |
130 | + | |
131 | + for (final SuggestConverter converter : converterList) { | |
132 | + reading = converter.convert(reading); | |
133 | + } | |
134 | + readingList.add(reading); | |
135 | + | |
136 | + } | |
137 | + | |
138 | + } catch (final Exception e) { | |
139 | + logger.warn("JapaneseTokenizer stream error", e); | |
140 | + } finally { | |
141 | + try { | |
142 | + input.reset(); | |
143 | + } catch (final Exception e) { | |
144 | + } | |
145 | + try { | |
146 | + stream.end(); | |
147 | + } catch (final Exception e) { | |
148 | + } | |
149 | + try { | |
150 | + rd.close(); | |
151 | + } catch (final Exception e) { | |
152 | + } | |
153 | + } | |
154 | + } | |
155 | + | |
156 | + @Override | |
157 | + public boolean incrementToken() throws IOException { | |
158 | + if (titleArray == null || offset >= titleArray.length) { | |
159 | + return false; | |
160 | + } | |
161 | + | |
162 | + termAtt.setEmpty(); | |
163 | + termAtt.append(convertSuggestString(titleArray[offset], | |
164 | + getReading(titleArray[offset]))); | |
165 | + offset++; | |
166 | + return true; | |
167 | + } | |
168 | + | |
169 | + @Override | |
170 | + public void reset() throws IOException { | |
171 | + super.reset(); | |
172 | + initialize(); | |
173 | + } | |
174 | + | |
175 | + private String convertSuggestString(final String term, final String reading) { | |
176 | + String suggestString; | |
177 | + if (reading != null && reading.length() > 0) { | |
178 | + suggestString = reading + wordSeparator + term; | |
179 | + } else { | |
180 | + suggestString = term; | |
181 | + } | |
182 | + | |
183 | + return suggestString; | |
184 | + } | |
185 | + | |
186 | + private String getReading(final String s) { | |
187 | + | |
188 | + final StringBuilder buf = new StringBuilder(); | |
189 | + | |
190 | + for (int i = 0; i < s.length(); i++) { | |
191 | + String term = ""; | |
192 | + int length = 0; | |
193 | + | |
194 | + for (int j = 0; j < termListByKuromoji.size(); j++) { | |
195 | + final String tmpStr = termListByKuromoji.get(j); | |
196 | + if (s.substring(i).indexOf(tmpStr) == 0) { | |
197 | + if (tmpStr.length() > term.length()) { | |
198 | + term = readingList.get(j); | |
199 | + length = tmpStr.length(); | |
200 | + } | |
201 | + } | |
202 | + } | |
203 | + if (term.length() > 0) { | |
204 | + buf.append(term); | |
205 | + i += length - 1; | |
206 | + } else { | |
207 | + char c = s.charAt(i); | |
208 | + | |
209 | + c = Transliterator.getInstance("Hiragana-Katakana") | |
210 | + .transliterate(String.valueOf(c)).charAt(0); | |
211 | + | |
212 | + buf.append(c); | |
213 | + } | |
214 | + } | |
215 | + | |
216 | + String reading = buf.toString(); | |
217 | + for (final SuggestConverter converter : converterList) { | |
218 | + reading = converter.convert(reading); | |
219 | + } | |
220 | + | |
221 | + return reading; | |
222 | + } | |
223 | +} |