added ConlluReader (experimental)
@@ -0,0 +1,428 @@ | ||
1 | +using ChaKi.Entity.Corpora; | |
2 | +using ChaKi.Entity.Corpora.Annotations; | |
3 | +using ChaKi.Entity.Readers; | |
4 | +using System; | |
5 | +using System.Collections.Generic; | |
6 | +using System.IO; | |
7 | +using System.Linq; | |
8 | +using System.Text; | |
9 | + | |
10 | +namespace ChaKi.Service.Readers | |
11 | +{ | |
12 | + public class ConllUReader : CorpusSourceReader | |
13 | + { | |
14 | + private static TagSet DefaultTagSet; | |
15 | + | |
16 | + protected Corpus m_Corpus; | |
17 | + | |
/// <summary>
/// Type initializer: builds the default tag set used for CONLL input
/// (a single "Bunsetsu" segment tag, version "1") and clears the tag set
/// shared through <c>CabochaReader.TagSet</c>.
/// </summary>
static ConllUReader()
{
    // Default TagSet for CONLL.
    var defaults = new TagSet("CabochaTagSet");
    defaults.AddTag(new Tag(Tag.SEGMENT, "Bunsetsu"));
    defaults.AddVersion(new TagSetVersion("1", 0, true));
    DefaultTagSet = defaults;

    // NOTE(review): this resets a static owned by CabochaReader the first time
    // this type is touched - confirm no other reader has populated it by then.
    CabochaReader.TagSet = null;
}
28 | + | |
/// <summary>
/// Creates a reader for the given corpus and lexicon builder.
/// The reader is experimental: the constructor currently always throws, so the
/// statements following the throw are unreachable until the guard is removed.
/// </summary>
/// <param name="corpus">Corpus that receives sentences, segments and links.</param>
/// <param name="lb">Lexicon builder used to register lexemes.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public ConllUReader(Corpus corpus, LexiconBuilder lb)
{
    throw new NotImplementedException("CONLLU format is not fully implemented yet.");

    this.LexiconBuilder = lb;
    m_Corpus = corpus;

    // The tag set starts from the Cabocha defaults, and tags met during import are added as they appear.
    // When several files are imported this constructor runs once per file; resetting the static
    // tag set each time would make Segments, Links, etc. reference ever-changing Tag objects,
    // so anything already present in the tag set is left untouched and defaults are only merged in.
    var shared = CabochaReader.TagSet;
    if (shared == null)
    {
        shared = new TagSet();
        CabochaReader.TagSet = shared;
    }
    shared.MergeWith(DefaultTagSet);
}
45 | + | |
46 | + public LexiconBuilder LexiconBuilder { get; set; } // 原始Lexicon | |
47 | + | |
48 | + public string EncodingToUse { get; set; } | |
49 | + | |
50 | + public bool FromDictionary { get; set; } // 辞書ファイルを読み込む場合はTrue; テキストを読み込む場合はFalse | |
51 | + private static TagSet TagSet => CabochaReader.TagSet; | |
52 | + | |
/// <summary>
/// Reads CONLL data from a file into the corpus
/// (dependency information goes into the Segment and Link tables).
/// </summary>
/// <param name="path">Path of the file to read; also stored as the document's file name.</param>
/// <param name="encoding">Name of the text encoding, resolved with <see cref="Encoding.GetEncoding(string)"/>.</param>
/// <returns>The newly created document filled from the file.</returns>
public Document ReadFromFileSLA(string path, string encoding)
{
    var document = new Document { FileName = path };
    var textEncoding = Encoding.GetEncoding(encoding);
    using (var input = new StreamReader(path, textEncoding))
    {
        // -1: no limit on the number of sentences.
        ReadFromStreamSLA(input, -1, document);
    }
    return document;
}
68 | + | |
/// <summary>
/// Obsolete entry point; not implemented for this reader.
/// </summary>
/// <param name="path">Unused.</param>
/// <param name="encoding">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
[Obsolete]
public Document ReadFromFile(string path, string encoding)
{
    throw new NotImplementedException();
}
74 | + | |
75 | + // 現在のDocument | |
76 | + private Document m_CurDoc; | |
77 | + // 現在の文 | |
78 | + private Sentence m_CurSen; | |
79 | + // 現在のSentence番号(通しIDおよびDocument毎のPos) | |
80 | + private int m_CurSenID; | |
81 | + // 現在のChar Position (Document毎のPos) | |
82 | + private int m_CurCharPos; | |
83 | + // 文節データの一時リスト | |
84 | + private CabochaBunsetsuList m_BunsetsuList; | |
85 | + // 現在(最後に読み込んだ)文節 | |
86 | + private CabochaBunsetsu m_CurBunsetsu; | |
87 | + // 現在の文内において、係り先が-1であるような文節のリスト | |
88 | + private List<CabochaBunsetsu> m_CurTerminalBunsetsu; | |
89 | + // 文のDocument単位の出現リスト | |
90 | + private List<Sentence> m_SentencesInDoc; | |
91 | + // 現在の複合語Chunk | |
92 | + private CompositeWordChunk m_CompositeWordChunk; | |
93 | + // Document全体の平文内容 | |
94 | + private StringBuilder m_DocumentTextBuilder; | |
95 | + | |
// Assigns one bunsetsu (segment) to the word described by a CONLL line and records its dependency.
//   s      : the raw tab-separated line (passed on to the lexicon builder and to error messages).
//   fields : s split on TAB; [0] = 1-based word index, [1] = surface form,
//            [5] = '|'-separated features, [6] = 1-based head index, [7] = dependency label.
// A line whose lexeme cannot be registered is reported on the console and skipped.
// Throws IndexOutOfRangeException when fields has fewer than two columns (callers log it).
private void ProcessOneLine_1(string s, string[] fields)
{
    var originalSurface = fields[1];

    Lexeme m = null;
    try
    {
        m = this.LexiconBuilder.AddEntryConll(s, this.FromDictionary, false);
    }
    catch (Exception)
    {
        Console.WriteLine("Lexeme parse error: {0}", s);
    }
    if (m == null)
    {
        return;
    }

    try
    {
        // Indices in the file are 1-based; internally they are 0-based, so a head of 0 (root) becomes -1.
        var f0 = Int32.Parse(fields[0]) - 1;
        var f7 = (fields.Length > 7) ? fields[7] : string.Empty;
        // Without an explicit head the word depends on the next word.
        var f6 = f0 + 1;
        if (fields.Length > 6 && fields[6] != "_")
        {
            f6 = Int32.Parse(fields[6]) - 1;
        }
        var buns = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, f0, f7, f6, 0.0);
        buns.EndPos = buns.StartPos + originalSurface.Length;
        m_CurBunsetsu = buns;
        if (buns.DependsTo == -1)
        {
            // Collected so that they can be re-targeted to the sentence-final dummy bunsetsu later.
            m_CurTerminalBunsetsu.Add(buns);
        }
        m_BunsetsuList.Add(buns);
    }
    catch (Exception)
    {
        // NOTE(review): on failure m_CurBunsetsu still refers to the previous word's bunsetsu,
        // so the word below is attached to it - confirm this is intended.
        Console.WriteLine("Bunsetsu parse error: {0}", s);
    }

    var feats = new string[0];
    if (fields.Length > 5 && fields[5] != "_")
    {
        feats = fields[5].Split('|');
    }
    // "SP" in the features column requests a blank after the word.
    var followedBySpace = feats.Contains("SP");
    var appendedText = followedBySpace ? originalSurface + " " : originalSurface;

    // Add the word to the sentence.
    var w = m_CurSen.AddWord(m);
    w.StartChar = m_CurCharPos;
    // The span is derived from the text actually appended to the document so that
    // character offsets can never drift away from the document text.
    w.EndChar = m_CurCharPos + appendedText.Length;
    if (followedBySpace)
    {
        w.Extras = " ";
    }
    m_DocumentTextBuilder.Append(appendedText);

    // Attach the word to the current bunsetsu.
    if (m_CurBunsetsu != null)
    {
        m_CurBunsetsu.Words.Add(w);
    }
    m_CurCharPos += (w.EndChar - w.StartChar);
}
171 | + | |
/// <summary>
/// Reads CONLL-style text from <paramref name="rdr"/> into <paramref name="doc"/> and the corpus.
/// Lines starting with '#' are ignored, a blank line ends a sentence, and every other line
/// describes one word. After reading, each bunsetsu is registered as a Segment and each
/// dependency as a Link.
/// </summary>
/// <param name="rdr">Source of the text lines.</param>
/// <param name="sentenceCount">When greater than 0, reading stops after this many sentences.</param>
/// <param name="doc">Document that receives the sentences and the concatenated plain text.</param>
public void ReadFromStreamSLA(TextReader rdr, int sentenceCount, Document doc)
{
    this.FromDictionary = false;

    m_CurDoc = doc;
    m_BunsetsuList = new CabochaBunsetsuList();
    m_CurCharPos = 0;
    m_CurSenID = 0;
    m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = 0 };
    m_CurBunsetsu = null; // bunsetsu read most recently
    m_CurTerminalBunsetsu = new List<CabochaBunsetsu>(); // bunsetsu of the current sentence whose head is -1
    m_DocumentTextBuilder = new StringBuilder();
    m_SentencesInDoc = new List<Sentence>();
    m_CompositeWordChunk = new CompositeWordChunk();

    int n = 0;      // number of sentences completed so far
    int lineNo = 0; // 1-based number of the physical line being processed (for diagnostics)
    while (true)
    {
        var s = rdr.ReadLine();
        if (s == null)
        {
            // At a well-formed end of file the current sentence is empty and nothing is pending.
            if (m_CurSen.Words.Count == 0 && m_CompositeWordChunk.IsEmpty())
            {
                break;
            }
            // The file lacks the final blank line: simulate it so the last sentence is closed.
            s = string.Empty;
        }
        else
        {
            lineNo++;
        }
        s = Cleanup(s); // drops a BOM left in the middle of the file (e.g. after concatenating files)

        if (s.StartsWith("#"))
        {
            continue; // comment line
        }

        if (s.Trim().Length > 0)
        {
            // A line describing one word.
            try
            {
                ProcessOneLine_2(s, s.Split('\t'));
            }
            catch (Exception ex)
            {
                Console.WriteLine("At line {0}: Error: {1}", lineNo, ex.Message);
            }
            continue;
        }

        // CONLL: a blank line ends the sentence.
        // A composite word still being collected belongs to this sentence; emit it first,
        // otherwise it would be silently discarded when the chunk is cleared below.
        try
        {
            FlushPendingCompositeWord();
        }
        catch (Exception ex)
        {
            Console.WriteLine("At line {0}: Error: {1}", lineNo, ex.Message);
        }

        CloseCurrentSentence();
        if (++n % 1000 == 0)
        {
            Console.Write("> {0} Sentences.\r", n);
        }
        if (sentenceCount > 0 && n >= sentenceCount)
        {
            break;
        }

        // Prepare a fresh sentence for the words that follow.
        int nextSentencePos = m_CurSen.Pos + 1;
        m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = nextSentencePos };
        m_CurSen.StartChar = m_CurCharPos;
        m_CurBunsetsu = null;
        m_CurTerminalBunsetsu.Clear();
        m_CompositeWordChunk.Clear();
    }
    m_CurDoc.Text = m_DocumentTextBuilder.ToString();
    Console.Write("> {0} Sentences.\r", n);
    Console.WriteLine();

    // Register the bunsetsu as Segments, then their dependencies as Links.
    AddBunsetsuSegmentsToCorpus();
    AddDependencyLinksToCorpus();
}

// Emits the composite word collected so far (if any) as a single word and always empties the chunk.
private void FlushPendingCompositeWord()
{
    if (m_CompositeWordChunk.IsEmpty())
    {
        return;
    }
    try
    {
        var line = m_CompositeWordChunk.ToConllSingleLine();
        ProcessOneLine_1(line, line.Split('\t'));
    }
    finally
    {
        m_CompositeWordChunk.Clear();
    }
}

// Finishes the current sentence: adds the default and dummy bunsetsu, re-targets root
// bunsetsu to the dummy, and registers the sentence with the document list and the corpus.
private void CloseCurrentSentence()
{
    if (m_CurBunsetsu == null)
    {
        // No bunsetsu was read: add a default one covering the sentence (for Chasen/Mecab style input).
        var defaultBunsetsu = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurSen.StartChar, 0, String.Empty, -1, 0.0);
        defaultBunsetsu.EndPos = m_CurCharPos;
        m_BunsetsuList.Add(defaultBunsetsu);
        m_CurBunsetsu = defaultBunsetsu;
        m_CurTerminalBunsetsu.Add(defaultBunsetsu);
    }

    // Append the sentence-final dummy bunsetsu.
    var dummy = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, m_CurBunsetsu.BunsetsuPos + 1, String.Empty, -1, 0.0);
    m_BunsetsuList.Add(dummy);

    // Every bunsetsu whose head is -1 is made to depend on the dummy.
    if (m_CurTerminalBunsetsu != null)
    {
        foreach (var terminal in m_CurTerminalBunsetsu)
        {
            terminal.DependsTo = dummy.BunsetsuPos;
        }
    }

    m_CurSen.EndChar = m_CurCharPos;
    m_SentencesInDoc.Add(m_CurSen);
    m_Corpus.AddSentence(m_CurSen);
}

// Creates one Segment per collected bunsetsu, adds it to the corpus and links its words to it.
private void AddBunsetsuSegmentsToCorpus()
{
    var bunsetsuTag = TagSet.FindTag(Tag.SEGMENT, "Bunsetsu");
    int processed = 0;
    foreach (var buns in m_BunsetsuList.Values)
    {
        if (++processed % 100 == 0)
        {
            Console.Write("> {0} Segments.\r", processed);
        }
        var seg = new Segment();
        seg.StartChar = buns.StartPos;
        seg.EndChar = buns.EndPos;
        seg.Tag = bunsetsuTag;
        seg.Doc = buns.Doc;
        seg.Sentence = buns.Sen;
        seg.Version = TagSet.CurrentVersion;
        m_Corpus.AddSegment(seg);
        buns.Seg = seg;
        foreach (var w in buns.Words)
        {
            w.Bunsetsu = seg;
        }
    }
    Console.WriteLine("> {0} Segments.", m_BunsetsuList.Count);
}

// Creates a Link for every bunsetsu whose head can be found in the same sentence
// and reports how many links were actually created.
private void AddDependencyLinksToCorpus()
{
    int processed = 0;
    int created = 0;
    foreach (var buns in m_BunsetsuList.Values)
    {
        if (++processed % 100 == 0)
        {
            Console.Write("> {0} Links.\r", processed);
        }
        var depBunsetsu = m_BunsetsuList.Find(buns.Sen, buns.DependsTo);
        if (depBunsetsu == null)
        {
            continue; // e.g. the dummy bunsetsu itself, whose head is -1
        }
        var link = new Link();
        link.From = buns.Seg;
        link.To = depBunsetsu.Seg;
        link.FromSentence = buns.Sen;
        link.ToSentence = buns.Sen;
        link.Tag = TagSet.FindOrAddTag(Tag.LINK, buns.DependsAs);
        link.Version = TagSet.CurrentVersion;
        link.Attributes.Add(new LinkAttribute()
        {
            Proj = link.Proj,
            Target = link,
            User = link.User,
            Version = link.Version,
            Key = "Score",
            Value = buns.Score.ToString()
        });
        m_Corpus.AddLink(link);
        created++;
    }
    Console.WriteLine("> {0} Links.", created);
}
333 | + | |
// Merges several lines into one word according to the IOB2 tags ("B-xxx" / "I-xxx") found in
// the FEATS column, then gives that single word one bunsetsu and one dependency.
// Dependencies between the members of a composite word are discarded.
//   s      : the raw tab-separated line.
//   fields : s split on TAB.
private void ProcessOneLine_2(string s, string[] fields)
{
    string[] featureList;
    if (fields.Length > 5 && fields[5] != "_")
    {
        featureList = fields[5].Split('|');
    }
    else
    {
        featureList = new string[0];
    }

    // When several B tags are present only the first one counts.
    var beginTag = Array.Find(featureList, f => f.StartsWith("B-"));
    var insideTag = Array.Find(featureList, f => f.StartsWith("I-"));
    var startsComposite = beginTag != null;
    var continuesComposite = insideTag != null;

    // No I tag means any composite word collected so far has ended: emit it first.
    if (!continuesComposite && !m_CompositeWordChunk.IsEmpty())
    {
        try
        {
            var mergedLine = m_CompositeWordChunk.ToConllSingleLine();
            ProcessOneLine_1(mergedLine, mergedLine.Split('\t'));
        }
        finally
        {
            m_CompositeWordChunk.Clear();
        }
    }

    // A B tag starts a new chunk; its POS is the text after "B-".
    if (startsComposite)
    {
        m_CompositeWordChunk.Clear();
        m_CompositeWordChunk.ChunkPOS = beginTag.Substring(2);
    }

    // Neither B nor I: an ordinary word, emitted immediately.
    if (!startsComposite && !continuesComposite)
    {
        ProcessOneLine_1(s, fields);
        return;
    }

    // B or I: collect the word into the chunk without emitting anything yet.
    var wordIndex = Int32.Parse(fields[0]) - 1;
    var featsColumn = (fields.Length > 5) ? fields[5] : "_";
    var headIndex = wordIndex + 1;
    if (fields.Length > 6 && fields[6] != "_")
    {
        headIndex = Int32.Parse(fields[6]) - 1;
    }
    var relation = (fields.Length > 7) ? fields[7] : string.Empty;
    m_CompositeWordChunk.Add(wordIndex, fields[1], fields[2], featsColumn, headIndex, relation);
}
388 | + | |
/// <summary>
/// Lexicon import is not implemented for this reader.
/// </summary>
/// <param name="rdr">Unused.</param>
/// <param name="baseOnly">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void ReadLexiconFromStream(TextReader rdr, bool baseOnly)
{
    throw new NotImplementedException();
}
393 | + | |
/// <summary>
/// Lexicon import is not implemented for this reader.
/// </summary>
/// <param name="rdr">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void ReadLexiconFromStream(TextReader rdr)
{
    throw new NotImplementedException();
}
398 | + | |
/// <summary>
/// Custom field definitions are not supported by this reader.
/// </summary>
/// <param name="fieldDefs">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void SetFieldDefs(Field[] fieldDefs)
{
    throw new NotImplementedException();
}
403 | + | |
/// <summary>
/// Strips byte-order marks (U+FEFF) from a line that begins with one.
/// A line whose first character is not a BOM is returned unchanged, even if a BOM
/// occurs later in it; a line that does begin with a BOM has every BOM removed.
/// </summary>
/// <param name="input">The line to clean; must not be null.</param>
/// <returns>The cleaned line.</returns>
public static string Cleanup(string input)
{
    const char Bom = '\uFEFF';

    // Fast path: nothing to do unless the line starts with a BOM.
    if (input.Length > 0 && input[0] != Bom)
    {
        return input;
    }

    // String.Replace(string, string) compares ordinally, so exactly the U+FEFF characters are dropped.
    return input.Replace(Bom.ToString(), string.Empty);
}
427 | + } | |
428 | +} |
@@ -90,6 +90,10 @@ | ||
90 | 90 | { |
91 | 91 | rdr = new ConllReader(cps, lb); |
92 | 92 | } |
93 | + else if (def.LineFormat == "CONLLU") | |
94 | + { | |
95 | + rdr = new ConllUReader(cps, lb); | |
96 | + } | |
93 | 97 | else |
94 | 98 | { |
95 | 99 | throw new Exception(string.Format("Invalid Reader Type: {0}", readerType)); |