CONLLU import/export implementation completed.
Changed the DOC/SENTENCETAG specification (attribute keys that start with "@" are now treated as Sentence-only attributes).
Modified the export process so that a projid can be specified (work in progress).
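As a quick illustration of the new attribute-key convention (a minimal sketch, not part of this commit; the key names below are invented): keys that start with "@" are treated as Sentence-only and surface on export only via SENTENCETAG(ID), while all other keys remain ordinary Document attributes written to DOCID and "#! ATTR" lines.

    // Illustrative sketch only: demonstrates the "@" key convention introduced here.
    using System;
    using System.Collections.Generic;

    static class AttributeKeyConventionDemo
    {
        // Sentence-only attributes are marked by an "@" prefix on the key.
        public static bool IsSentenceOnly(string key) => key.StartsWith("@");

        public static void Main()
        {
            var attrs = new Dictionary<string, string>
            {
                ["@sent_id"] = "doc1-s3",    // Sentence-only: exported via SENTENCETAG only; "@" is stripped in XML output
                ["Title"]    = "Sample doc"  // Document attribute: exported to DOCID / "#! ATTR" lines as before
            };
            foreach (var kv in attrs)
            {
                Console.WriteLine($"{kv.Key} -> {(IsSentenceOnly(kv.Key) ? "SENTENCETAG" : "DOC #ATTR")}");
            }
        }
    }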
@@ -161,7 +161,10 @@ | ||
161 | 161 | rows.Add(new AttributeData("ID", sen.ParentDoc.ID.ToString(), AttributeGridRowType.ReadOnly)); |
162 | 162 | foreach (DocumentAttribute a in sen.ParentDoc.Attributes) |
163 | 163 | { |
164 | - rows.Add(new AttributeData(a.Key, a.Value, AttributeGridRowType.KeyValueWritable)); | |
164 | + if (!a.Key.StartsWith("@")) | |
165 | + { | |
166 | + rows.Add(new AttributeData(a.Key, a.Value, AttributeGridRowType.KeyValueWritable)); | |
167 | + } | |
165 | 168 | } |
166 | 169 | // --> ここだけ変則的。呼び出し元のSetSource()に存在すべきものである。Sentence/Documentについても、sourceをobject[]にすればすっきりする(TODO)。 |
167 | 170 | if (this.attributeGrid1.IsEditing) |
@@ -499,6 +499,10 @@ | ||
499 | 499 | { |
500 | 500 | filterStr.AppendFormat("{0}|*.conll|", def.Name); |
501 | 501 | } |
502 | + else if (def.Name == "CONLLU") | |
503 | + { | |
504 | + filterStr.AppendFormat("{0}|*.conllu|", def.Name); | |
505 | + } | |
502 | 506 | else //TODO: とりあえずテキストエクスポートは選択できないようにしておく |
503 | 507 | { |
504 | 508 | continue; |
@@ -554,6 +558,10 @@ | ||
554 | 558 | { |
555 | 559 | svc = new ExportServiceConll(wr); |
556 | 560 | } |
561 | + else if (def.Name == "CONLLU") | |
562 | + { | |
563 | + svc = new ExportServiceConllU(wr); | |
564 | + } | |
557 | 565 | else |
558 | 566 | { |
559 | 567 | svc = new ExportServiceCabocha(wr, def); |
@@ -51,7 +51,7 @@ | ||
51 | 51 | { |
52 | 52 | OpenFileFolderDialog dlg = new OpenFileFolderDialog(); |
53 | 53 | dlg.Title = "Select Input File/Folder"; |
54 | - dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*"; | |
54 | + dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|CONLLU files (*.conllu)|*.conllu|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*"; | |
55 | 55 | if (ImportFileLastSelectedFilterIndex > 0) |
56 | 56 | { |
57 | 57 | dlg.FilterIndex = ImportFileLastSelectedFilterIndex; |
@@ -101,7 +101,7 @@ | ||
101 | 101 | { |
102 | 102 | OpenFileFolderDialog dlg = new OpenFileFolderDialog(); |
103 | 103 | dlg.Title = "Select Input File/Folder"; |
104 | - dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*"; | |
104 | + dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|CONLLU files (*.conllu)|*.conllu|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*"; | |
105 | 105 | dlg.FileMustExist = true; |
106 | 106 | if (ImportFileLastSelectedFilterIndex > 0) |
107 | 107 | { |
@@ -25,7 +25,14 @@ | ||
25 | 25 | { |
26 | 26 | using (XmlWriter wr = new XmlTextWriter(twr)) |
27 | 27 | { |
28 | - wr.WriteElementString(this.Key, this.Value); | |
28 | + if (this.Key.StartsWith("@")) | |
29 | + { | |
30 | + wr.WriteElementString(this.Key.Substring(1), this.Value); | |
31 | + } | |
32 | + else | |
33 | + { | |
34 | + wr.WriteElementString(this.Key, this.Value); | |
35 | + } | |
29 | 36 | return twr.ToString(); |
30 | 37 | } |
31 | 38 | } |
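For example, a DocumentAttribute with Key "@sent_id" and Value "d1-s1" now serializes as <sent_id>d1-s1</sent_id> (the leading "@" is dropped so the element name remains a valid XML name); keys without the "@" prefix serialize unchanged, as before.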
@@ -22,6 +22,15 @@ | ||
22 | 22 | /// </summary> |
23 | 23 | public virtual string Text { get; set; } |
24 | 24 | |
25 | + /// <summary> | |
26 | + /// 2021.1.11仕様追加 | |
27 | + /// 従来より、DocumentAttributeの一部はSentenceAttributeとして使われている. | |
28 | + /// (Sentence自体はDocumentAttributeへのIDしか持てない.) | |
29 | + /// SentenceAttribute専用のAttributeを区別するため、Keyの先頭に"@"を付与するものとする. | |
30 | + /// Export時に、 | |
31 | + /// @付きのAttributeは、SENTENCETAGIDとしてのみ出力される. | |
32 | + /// @なしのAttributeは、DOCIDおよびDocumentの#ATTRタグに出力される. | |
33 | + /// </summary> | |
25 | 34 | public virtual ISet<DocumentAttribute> Attributes { get; set; } |
26 | 35 | |
27 | 36 | public Document() |
@@ -76,7 +85,10 @@ | ||
76 | 85 | { |
77 | 86 | foreach (DocumentAttribute attr in this.Attributes) |
78 | 87 | { |
79 | - wr.WriteElementString(attr.Key, attr.Value); | |
88 | + if (!attr.Key.StartsWith("@")) | |
89 | + { | |
90 | + wr.WriteElementString(attr.Key, attr.Value); | |
91 | + } | |
80 | 92 | } |
81 | 93 | return twr.ToString(); |
82 | 94 | } |
@@ -12,5 +12,5 @@ | ||
12 | 12 | [assembly: AssemblyCulture("")] |
13 | 13 | [assembly: ComVisible(false)] |
14 | 14 | [assembly: Guid("6a95808a-d1e3-47de-bb62-7ed7a281ac0b")] |
15 | -[assembly: AssemblyVersion("3.14.642.0")] | |
16 | -[assembly: AssemblyFileVersion("3.14.642.0")] | |
15 | +[assembly: AssemblyVersion("3.14.648.0")] | |
16 | +[assembly: AssemblyFileVersion("3.14.648.0")] |
@@ -114,7 +114,7 @@ | ||
114 | 114 | //} |
115 | 115 | |
116 | 116 | // Groupは直接文と関連ついていないので、segment->group_member->groupと順にたどって、文に付いているGroupを求める. |
117 | - public static IList<Group> RetrieveWordGroups(ISession sess, Sentence sen) | |
117 | + public static IList<Group> RetrieveWordGroups(ISession sess, Sentence sen, int projid) | |
118 | 118 | { |
119 | 119 | var sgtags = GetSegmentTagIDForGroups(sess); // これはSegmentのタグで、Groupに関係あるもの. |
120 | 120 | if (sgtags.Count == 0) |
@@ -131,9 +131,10 @@ | ||
131 | 131 | List<long> segs = new List<long>(); |
132 | 132 | using (var cmd = sess.Connection.CreateCommand()) |
133 | 133 | { |
134 | - cmd.CommandText = string.Format("SELECT id from segment" | |
135 | - + " WHERE document_id={0} AND sentence_id={1} AND tag_definition_id IN {2}", | |
136 | - sen.ParentDoc.ID, sen.ID, BuildIDList(sgtags)); | |
134 | + cmd.CommandText = "SELECT id from segment" + | |
135 | + $" WHERE document_id={sen.ParentDoc.ID} AND sentence_id={sen.ID}" + | |
136 | + $" AND tag_definition_id IN {BuildIDList(sgtags)}" + | |
137 | + $" AND project_id={projid}"; | |
137 | 138 | var rdr = cmd.ExecuteReader(); |
138 | 139 | while (rdr.Read()) |
139 | 140 | { |
@@ -172,7 +173,7 @@ | ||
172 | 173 | return q.List<Group>(); |
173 | 174 | } |
174 | 175 | |
175 | - public static IList<Group> RetrieveMWEGroups(ISession sess, Sentence sen) | |
176 | + public static IList<Group> RetrieveMWEGroups(ISession sess, Sentence sen, int projid) | |
176 | 177 | { |
177 | 178 | if (MweSegTag < 0) |
178 | 179 | { |
@@ -188,7 +189,7 @@ | ||
188 | 189 | { |
189 | 190 | cmd.CommandText = "SELECT id from segment" |
190 | 191 | + $" WHERE document_id={sen.ParentDoc.ID} AND sentence_id={sen.ID} " |
191 | - + $"AND tag_definition_id={MweSegTag}"; | |
192 | + + $"AND tag_definition_id={MweSegTag} AND project_id={projid}"; | |
192 | 193 | var rdr = cmd.ExecuteReader(); |
193 | 194 | while (rdr.Read()) |
194 | 195 | { |
@@ -205,8 +206,9 @@ | ||
205 | 206 | List<long> groupids = new List<long>(); |
206 | 207 | using (var cmd = sess.Connection.CreateCommand()) |
207 | 208 | { |
208 | - cmd.CommandText = "SELECT DISTINCT group_id FROM group_member" | |
209 | - + $" WHERE member_id IN {BuildIDList(segs)} AND object_Type='{Tag.SEGMENT}' ORDER BY group_id ASC"; | |
209 | + cmd.CommandText = "SELECT DISTINCT group_id FROM group_member" + | |
210 | + $" WHERE member_id IN {BuildIDList(segs)} " + | |
211 | + $"AND object_Type='{Tag.SEGMENT}' ORDER BY group_id ASC"; | |
210 | 212 | var rdr = cmd.ExecuteReader(); |
211 | 213 | while (rdr.Read()) |
212 | 214 | { |
@@ -231,17 +233,17 @@ | ||
231 | 233 | /// <param name="sess"></param> |
232 | 234 | /// <param name="sen"></param> |
233 | 235 | /// <returns></returns> |
234 | - public static IList<long> RetrieveMiscSegments(ISession sess, Sentence sen) | |
236 | + public static IList<long> RetrieveMiscSegments(ISession sess, Sentence sen, int projid) | |
235 | 237 | { |
236 | 238 | var gtag = GetBunsetsuTagId(sess); |
237 | 239 | |
238 | 240 | using (var cmd = sess.Connection.CreateCommand()) |
239 | 241 | { |
240 | - cmd.CommandText = string.Format( | |
241 | - "SELECT s.id FROM segment s" | |
242 | - + " INNER JOIN tag_definition t ON s.tag_definition_id=t.id" | |
243 | - + " WHERE s.document_id={0} AND s.sentence_id={1} and s.tag_definition_id!={2}", | |
244 | - sen.ParentDoc.ID, sen.ID, gtag); | |
242 | + cmd.CommandText = | |
243 | + "SELECT s.id FROM segment s" + | |
244 | + " INNER JOIN tag_definition t ON s.tag_definition_id=t.id" + | |
245 | + $" WHERE s.document_id={sen.ParentDoc.ID} AND s.sentence_id={sen.ID}" + | |
246 | + $" and s.tag_definition_id!={gtag} and s.project_id={projid}"; | |
245 | 247 | var rdr = cmd.ExecuteReader(); |
246 | 248 | var result = new List<long>(); |
247 | 249 | while (rdr.Read()) |
@@ -323,19 +325,19 @@ | ||
323 | 325 | /// <param name="sess"></param> |
324 | 326 | /// <param name="sen"></param> |
325 | 327 | /// <returns></returns> |
326 | - public static IList<long> RetrieveMiscLinks(ISession sess, Sentence sen) | |
328 | + public static IList<long> RetrieveMiscLinks(ISession sess, Sentence sen, int projid) | |
327 | 329 | { |
328 | 330 | long bunsetsuid = GetBunsetsuTagId(sess); |
329 | 331 | |
330 | 332 | using (var cmd = sess.Connection.CreateCommand()) |
331 | 333 | { |
332 | - cmd.CommandText = string.Format( | |
333 | - "SELECT l.id FROM link l" | |
334 | - + " INNER JOIN tag_definition t ON t.id=l.tag_definition_id" | |
335 | - + " INNER JOIN segment s1 ON s1.id=l.from_segment_id" | |
336 | - + " INNER JOIN segment s2 ON s2.id=l.to_segment_id" | |
337 | - + " WHERE l.from_sentence_id={0} AND s1.tag_definition_id !={1} AND s2.tag_definition_id != {1}", | |
338 | - sen.ID, bunsetsuid); | |
334 | + cmd.CommandText = | |
335 | + "SELECT l.id FROM link l" + | |
336 | + " INNER JOIN tag_definition t ON t.id=l.tag_definition_id" + | |
337 | + " INNER JOIN segment s1 ON s1.id=l.from_segment_id" + | |
338 | + " INNER JOIN segment s2 ON s2.id=l.to_segment_id" + | |
339 | + $" WHERE l.from_sentence_id={sen.ID} AND s1.tag_definition_id !={bunsetsuid}" + | |
340 | +                    $" AND s2.tag_definition_id != {bunsetsuid} AND l.project_id={projid}"; | |
339 | 341 | var rdr = cmd.ExecuteReader(); |
340 | 342 | var result = new List<long>(); |
341 | 343 | while (rdr.Read()) |
@@ -373,13 +375,15 @@ | ||
373 | 375 | /// <param name="startPos"></param> |
374 | 376 | /// <param name="endPos"></param> |
375 | 377 | /// <returns></returns> |
376 | - public static IList<Segment> RetrieveBunsetsu(ISession sess, Sentence sen) | |
378 | + public static IList<Segment> RetrieveBunsetsu(ISession sess, Sentence sen, int projid) | |
377 | 379 | { |
378 | 380 | long bunsetsuid = GetBunsetsuTagId(sess); |
379 | 381 | |
380 | 382 | var q = sess.CreateQuery( |
381 | - string.Format("from Segment seg where seg.Doc.ID = {0} and seg.Sentence.ID = {1} and seg.Tag.ID = {2} order by seg.StartChar", | |
382 | - sen.ParentDoc.ID, sen.ID, bunsetsuid)); | |
383 | + $"from Segment seg where seg.Doc.ID = {sen.ParentDoc.ID}" + | |
384 | + $" and seg.Sentence.ID = {sen.ID}" + | |
385 | + $" and seg.Tag.ID = {bunsetsuid}" + | |
386 | + $" and seg.Proj.ID = {projid} order by seg.StartChar"); | |
383 | 387 | return q.List<Segment>(); |
384 | 388 | } |
385 | 389 |
@@ -386,7 +386,7 @@ | ||
386 | 386 | { |
387 | 387 | throw new InvalidOperationException("Unknown sentence."); |
388 | 388 | } |
389 | - return Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen); | |
389 | + return Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen, m_Context.Proj.ID); | |
390 | 390 | } |
391 | 391 | |
392 | 392 | public TagSet GetTagSet() |
@@ -1208,11 +1208,11 @@ | ||
1208 | 1208 | { |
1209 | 1209 | throw new InvalidOperationException("Unknown sentence."); |
1210 | 1210 | } |
1211 | - var groups = Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen); | |
1211 | + var groups = Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen, m_Context.Proj.ID); | |
1212 | 1212 | var mwegroups = from g in groups where g.Tag.Name == "MWE" select g; |
1213 | 1213 | |
1214 | 1214 | // この文のBunsetsu Segmentをリスト |
1215 | - var buns = Util.RetrieveBunsetsu(m_Context.Session, m_Context.Sen); | |
1215 | + var buns = Util.RetrieveBunsetsu(m_Context.Session, m_Context.Sen, m_Context.Proj.ID); | |
1216 | 1216 | // この文のWordをリスト |
1217 | 1217 | var words = m_Context.Sen.GetWords(m_Context.Proj.ID); |
1218 | 1218 | if (buns.Count() != words.Count() + 1) |
@@ -123,7 +123,7 @@ | ||
123 | 123 | private bool WriteBunsetsuAndAnnotations(Sentence sen) |
124 | 124 | { |
125 | 125 | var hasAnnotation = false; |
126 | - var grps = Util.RetrieveWordGroups(m_Session, sen); | |
126 | + var grps = Util.RetrieveWordGroups(m_Session, sen, m_ProjId); | |
127 | 127 | |
128 | 128 | var senOffset = sen.StartChar; |
129 | 129 | var segno = 0; // 本メソッドローカルのSegment番号(文内) |
@@ -183,7 +183,7 @@ | ||
183 | 183 | } |
184 | 184 | |
185 | 185 | // Groupの一部およびCabochaで表現される以外のSegmentを出力する |
186 | - var segids = Util.RetrieveMiscSegments(m_Session, sen); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment | |
186 | + var segids = Util.RetrieveMiscSegments(m_Session, sen, m_ProjId); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment | |
187 | 187 | foreach (var segid in segids) |
188 | 188 | { |
189 | 189 | if (seg_index.ContainsKey(segid)) // 既にGroupの一部としてExportされたSegmentは除外する. |
@@ -209,7 +209,7 @@ | ||
209 | 209 | } |
210 | 210 | |
211 | 211 | // Cabochaで表現される以外のLinkを出力する |
212 | - IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen); // fromsegid, tosegid, tagname, comment | |
212 | + IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen, m_ProjId); // fromsegid, tosegid, tagname, comment | |
213 | 213 | foreach (var linkid in linkids) |
214 | 214 | { |
215 | 215 | try |
@@ -25,6 +25,8 @@ | ||
25 | 25 | protected HashSet<Namespace> m_Namespaces = new HashSet<Namespace>(); |
26 | 26 | protected ReaderDef m_Def; |
27 | 27 | |
28 | + protected int m_ProjId = 0; | |
29 | + | |
28 | 30 | public void Dispose() |
29 | 31 | { |
30 | 32 | Dispose(true); |
@@ -26,6 +26,7 @@ | ||
26 | 26 | protected int m_CurrentDocId; |
27 | 27 | protected Action<Lexeme> m_LexemeWriter; |
28 | 28 | protected Dictionary<Document, int> m_DocOffsetCache; |
29 | + protected bool m_OutputSentencetagids = true; | |
29 | 30 | |
30 | 31 | public ExportServiceCabocha(TextWriter wr) |
31 | 32 | { |
@@ -117,10 +118,7 @@ | ||
117 | 118 | foreach (var a in sen.Attributes) |
118 | 119 | { |
119 | 120 | string csa = string.Format("{0}:{1}", ki.Crps.Name, a.ID); |
120 | - if (m_SentenceTags.TryGetValue(csa, out pair)) | |
121 | - { | |
122 | - m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid); | |
123 | - } | |
121 | + WriteSentenceTag(csa); | |
124 | 122 | } |
125 | 123 | |
126 | 124 | WriteWords(sen.GetWords(0)); |
@@ -128,6 +126,14 @@ | ||
128 | 126 | WriteEos(); |
129 | 127 | } |
130 | 128 | |
129 | + protected virtual void WriteSentenceTag(string csa) | |
130 | + { | |
131 | + if (m_SentenceTags.TryGetValue(csa, out var pair)) | |
132 | + { | |
133 | + m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid); | |
134 | + } | |
135 | + } | |
136 | + | |
131 | 137 | static public long t1 = 0; |
132 | 138 | static public long t2 = 0; |
133 | 139 | static public long t3 = 0; |
@@ -141,6 +147,8 @@ | ||
141 | 147 | { |
142 | 148 | if (m_TextWriter == null) throw new InvalidOperationException("TextWriter is null."); |
143 | 149 | |
150 | + m_ProjId = project_id; | |
151 | + | |
144 | 152 | var t0 = DateTime.Now.Ticks; |
145 | 153 | #if false // NHibernateでアクセスするとn^2オーダーの時間がかかる. |
146 | 154 | var segs = m_Session.CreateSQLQuery( |
@@ -161,8 +169,7 @@ | ||
161 | 169 | m_Segs = new Dictionary<long, int>(); |
162 | 170 | using (var cmd = m_Session.Connection.CreateCommand()) |
163 | 171 | { |
164 | - cmd.CommandText = string.Format("SELECT id FROM segment WHERE sentence_id={0} and tag_definition_id={1} ORDER BY start_char", | |
165 | - sen.ID, tagid); | |
172 | + cmd.CommandText = $"SELECT id FROM segment WHERE sentence_id={sen.ID} and tag_definition_id={tagid} and project_id={m_ProjId} ORDER BY start_char"; | |
166 | 173 | var rdr = cmd.ExecuteReader(); |
167 | 174 | long last = -1; |
168 | 175 | int i = 0; |
@@ -207,7 +214,7 @@ | ||
207 | 214 | var tosegid = (long)rdr[2]; |
208 | 215 | var linktagname = (string)rdr[3]; |
209 | 216 | var score = 0.0; |
210 | - Double.TryParse((rdr[4] as string)??string.Empty, out score); | |
217 | + Double.TryParse((rdr[4] as string) ?? string.Empty, out score); | |
211 | 218 | m_Links.Add(fromsegid, new object[] { linkid, tosegid, linktagname, score }); |
212 | 219 | } |
213 | 220 | rdr.Close(); |
@@ -232,10 +239,7 @@ | ||
232 | 239 | foreach (var a in sen.Attributes) |
233 | 240 | { |
234 | 241 | string csa = string.Format("{0}:{1}", crps.Name, a.ID); |
235 | - if (m_SentenceTags.TryGetValue(csa, out pair)) | |
236 | - { | |
237 | - m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid); | |
238 | - } | |
242 | + WriteSentenceTag(csa); | |
239 | 243 | } |
240 | 244 | |
241 | 245 | t3 += (DateTime.Now.Ticks - t0); |
@@ -252,7 +256,7 @@ | ||
252 | 256 | cmd.CommandText = |
253 | 257 | "SELECT w.lexeme_id, w.bunsetsu_segment_id, w.head_info, w.extra_chars" + |
254 | 258 | " FROM word w" + |
255 | - $" WHERE w.sentence_id={sen.ID} AND w.project_id={project_id} ORDER BY w.position ASC"; | |
259 | + $" WHERE w.sentence_id={sen.ID} AND w.project_id={m_ProjId} ORDER BY w.position ASC"; | |
256 | 260 | var rdr = cmd.ExecuteReader(); |
257 | 261 | while (rdr.Read()) |
258 | 262 | { |
@@ -368,9 +372,12 @@ | ||
368 | 372 | { |
369 | 373 | if (m_TextWriter == null) throw new InvalidOperationException("TextWriter is null."); |
370 | 374 | |
371 | - foreach (KeyValuePair<string, SeqIDTagPair> pair in m_SentenceTags) | |
375 | + if (m_OutputSentencetagids) | |
372 | 376 | { |
373 | - m_TextWriter.Write(pair.Value.Tag); | |
377 | + foreach (KeyValuePair<string, SeqIDTagPair> pair in m_SentenceTags) | |
378 | + { | |
379 | + m_TextWriter.Write(pair.Value.Tag); | |
380 | + } | |
374 | 381 | } |
375 | 382 | } |
376 | 383 |
@@ -413,7 +420,7 @@ | ||
413 | 420 | protected bool WriteAnnotations(Sentence sen) |
414 | 421 | { |
415 | 422 | bool hasAnnotation = false; |
416 | - IList<Group> grps = Util.RetrieveWordGroups(m_Session, sen); | |
423 | + IList<Group> grps = Util.RetrieveWordGroups(m_Session, sen, m_ProjId); | |
417 | 424 | |
418 | 425 | int senOffset = sen.StartChar; |
419 | 426 | int segno = 0; // 本メソッドローカルのSegment番号(文内) |
@@ -438,7 +445,7 @@ | ||
438 | 445 | } |
439 | 446 | |
440 | 447 | // Groupの一部およびCabochaで表現される以外のSegmentを出力する |
441 | - IList<long> segids = Util.RetrieveMiscSegments(m_Session, sen); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment | |
448 | + IList<long> segids = Util.RetrieveMiscSegments(m_Session, sen, m_ProjId); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment | |
442 | 449 | foreach (var segid in segids) |
443 | 450 | { |
444 | 451 | if (seg_index.ContainsKey(segid)) // 既にGroupの一部としてExportされたSegmentは除外する. |
@@ -464,7 +471,7 @@ | ||
464 | 471 | } |
465 | 472 | |
466 | 473 | // Cabochaで表現される以外のLinkを出力する |
467 | - IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen); // fromsegid, tosegid, tagname, comment | |
474 | + IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen, m_ProjId); // fromsegid, tosegid, tagname, comment | |
468 | 475 | foreach (var linkid in linkids) |
469 | 476 | { |
470 | 477 | try |
@@ -525,7 +532,10 @@ | ||
525 | 532 | { |
526 | 533 | foreach (var attr in doc.Attributes) |
527 | 534 | { |
528 | - m_TextWriter.WriteLine("#! ATTR \"{0}\" \"{1}\" \"{2}\"", attr.Key, attr.Value, Util.EscapeQuote(attr.Comment)); | |
535 | + if (!attr.Key.StartsWith("@")) // "@"で始まるのはSentenceAttributeなのでDocumentAttributeとしては出力しない(SENTENCETAGIDとして出力される) | |
536 | + { | |
537 | + m_TextWriter.WriteLine("#! ATTR \"{0}\" \"{1}\" \"{2}\"", attr.Key, attr.Value, Util.EscapeQuote(attr.Comment)); | |
538 | + } | |
529 | 539 | } |
530 | 540 | return; |
531 | 541 | } |
@@ -19,11 +19,11 @@ | ||
19 | 19 | { |
20 | 20 | } |
21 | 21 | |
22 | - private long currentBunsetsuID; | |
23 | - private int bunsetsuPos; | |
24 | - private List<string> sb = new List<string>(); // Form | |
25 | - private List<string> sb2 = new List<string>(); // Base | |
26 | - private const string EMPTY_COL = "_"; | |
22 | + protected long currentBunsetsuID; | |
23 | + protected int bunsetsuPos; | |
24 | + protected List<string> sb = new List<string>(); // Form | |
25 | + protected List<string> sb2 = new List<string>(); // Base | |
26 | + protected const string EMPTY_COL = "_"; | |
27 | 27 | |
28 | 28 | protected override void WriteWords(IList<Word> words) |
29 | 29 | { |
@@ -110,7 +110,7 @@ | ||
110 | 110 | m_TextWriter.WriteLine(); |
111 | 111 | } |
112 | 112 | |
113 | - private int GetDependToIndex(long buns_id, out string dependAs) | |
113 | + protected int GetDependToIndex(long buns_id, out string dependAs) | |
114 | 114 | { |
115 | 115 | long toSegID = -1; |
116 | 116 | object[] link = null; |
@@ -129,7 +129,7 @@ | ||
129 | 129 | return index; |
130 | 130 | } |
131 | 131 | |
132 | - private string GetFeats(Word w) | |
132 | + protected virtual string GetFeats(Word w) | |
133 | 133 | { |
134 | 134 | var feats = new List<string>(); |
135 | 135 | if (w.Extras.Length > 0) |
@@ -0,0 +1,189 @@ | ||
1 | +using ChaKi.Entity.Corpora; | |
2 | +using ChaKi.Entity.Corpora.Annotations; | |
3 | +using System; | |
4 | +using System.Collections.Generic; | |
5 | +using System.IO; | |
6 | +using System.Linq; | |
7 | +using System.Text; | |
8 | +using System.Xml; | |
9 | + | |
10 | +namespace ChaKi.Service.Export | |
11 | +{ | |
12 | + public class ExportServiceConllU : ExportServiceConll | |
13 | + { | |
14 | + public ExportServiceConllU(TextWriter wr) | |
15 | + : base(wr) | |
16 | + { | |
17 | + // SENTENCETAGは出力しない. 代わりに各Sentenceの冒頭でコメント行を出力. | |
18 | + m_OutputSentencetagids = false; | |
19 | + } | |
20 | + | |
21 | + protected override void WriteWords(IList<Word> words) | |
22 | + { | |
23 | + // ここで渡されるwordsには、Bunsetsu.ID, HeadInfo, Lex, Extras しかセットされていないtransient objectなので注意. | |
24 | + // ExportServiceCabocha.ExportItem()を参照. | |
25 | + | |
26 | + currentBunsetsuID = -1; | |
27 | + bunsetsuPos = 0; | |
28 | + sb.Clear(); | |
29 | + sb2.Clear(); | |
30 | + | |
31 | + var t00 = DateTime.Now.Ticks; | |
32 | + | |
33 | + var wordsarray = words.Cast<Word>().ToArray(); | |
34 | + | |
35 | + t5 += (DateTime.Now.Ticks - t00); | |
36 | + t00 = DateTime.Now.Ticks; | |
37 | + | |
38 | + int n = wordsarray.Length; | |
39 | + for (int i = 0; i < n; i++) | |
40 | + { | |
41 | + var t0 = DateTime.Now.Ticks; | |
42 | + | |
43 | + Word w = wordsarray[i]; | |
44 | + | |
45 | + t1 += (DateTime.Now.Ticks - t0); | |
46 | + t0 = DateTime.Now.Ticks; | |
47 | + | |
48 | + var lex = w.Lex; | |
49 | + if (lex == null) | |
50 | + { | |
51 | + continue; | |
52 | + } | |
53 | + sb.Add(lex.Surface); | |
54 | + sb.Add(w.Extras); | |
55 | + sb2.Add(lex.BaseLexeme.Surface); | |
56 | + sb2.Add(w.Extras); | |
57 | + | |
58 | + var pos = lex.PartOfSpeech; | |
59 | + t2 += (DateTime.Now.Ticks - t0); | |
60 | + t0 = DateTime.Now.Ticks; | |
61 | + | |
62 | + var buns = w.Bunsetsu.ID; | |
63 | + if (currentBunsetsuID != buns || i == n - 1) | |
64 | + { | |
65 | + // Output word + dependency | |
66 | + // 自身 | |
67 | + int buns_pos = -1; | |
68 | + m_Segs.TryGetValue(buns, out buns_pos); | |
69 | + // 係り先 | |
70 | + string depAs; | |
71 | + var depTo = GetDependToIndex(buns, out depAs); | |
72 | + | |
73 | + if (sb.Count > 0) | |
74 | + { | |
75 | + sb.RemoveAt(sb.Count - 1); // 最後のExtraは出力しない. | |
76 | + } | |
77 | + if (sb2.Count > 0) | |
78 | + { | |
79 | + sb2.RemoveAt(sb2.Count - 1); | |
80 | + } | |
81 | + m_TextWriter.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}", | |
82 | + buns_pos + 1, // ID | |
83 | + string.Join("", sb.ToArray()), // FORM | |
84 | + string.Join("", sb2.ToArray()), // LEMMA | |
85 | + pos.Name1, // UPOS | |
86 | + GetXPOS(pos), // XPOS | |
87 | + GetFeats(w), // FEATS | |
88 | + depTo + 1, // HEAD | |
89 | + depAs, // DEPREL | |
90 | + EMPTY_COL, // DEPS | |
91 | + GetMisc(buns, lex)); // MISC | |
92 | + sb.Clear(); | |
93 | + sb2.Clear(); | |
94 | + currentBunsetsuID = buns; | |
95 | + bunsetsuPos++; | |
96 | + } | |
97 | + } | |
98 | + t6 += (DateTime.Now.Ticks - t00); | |
99 | + } | |
100 | + | |
101 | + private string GetXPOS(PartOfSpeech pos) | |
102 | + { | |
103 | + // pos.Nameの'-'で区切られた最初の要素を除く部分を返す. | |
104 | + var p = pos.Name.IndexOf('-'); | |
105 | + if (p >= 0 && p < (pos.Name.Length - 1)) | |
106 | + { | |
107 | + return pos.Name.Substring(p + 1); | |
108 | + } | |
109 | + return string.Empty; | |
110 | + } | |
111 | + | |
112 | + protected override string GetFeats(Word w) | |
113 | + { | |
114 | + var feats = new List<string>(); | |
115 | + if (w.Extras.Length > 0) | |
116 | + { | |
117 | + feats.Add("SP"); | |
118 | + } | |
119 | + // 今のところ、Word Featureをこれ以外に保持していない。Segment Attributeをここに入れるか? | |
120 | + | |
121 | + var result = string.Join("|", feats.ToArray()); | |
122 | + if (result.Length > 0) | |
123 | + { | |
124 | + return result; | |
125 | + } | |
126 | + return EMPTY_COL; | |
127 | + } | |
128 | + | |
129 | + // Segment AttributeとLexeme.CustomPropertyからCONLLU MISCフィールドを生成 | |
130 | + private string GetMisc(long segid, Lexeme lex) | |
131 | + { | |
132 | + var fields = new List<string>(); | |
133 | + | |
134 | + // Segment Attributes | |
135 | + using (var cmd = m_Session.Connection.CreateCommand()) | |
136 | + { | |
137 | + cmd.CommandText = | |
138 | + "SELECT attribute_key,attribute_value" + | |
139 | + " FROM segment_attribute" + | |
140 | + $" WHERE segment_id={segid} AND project_id={m_ProjId} ORDER BY id ASC"; | |
141 | + var rdr = cmd.ExecuteReader(); | |
142 | + while (rdr.Read()) | |
143 | + { | |
144 | + var key = rdr.GetString(0); | |
145 | + var val = rdr.GetString(1); | |
146 | + fields.Add($"{key}={val}"); | |
147 | + } | |
148 | + } | |
149 | + | |
150 | + | |
151 | + // Lexeme.CustomProperty | |
152 | + var cust = lex.CustomProperty; | |
153 | + foreach (var prop in cust.Split('\n')) | |
154 | + { | |
155 | + var pair = prop.Split('\t'); | |
156 | + if (pair.Length == 2) | |
157 | + { | |
158 | + fields.Add($"{pair[0]}={pair[1]}"); | |
159 | + } | |
160 | + } | |
161 | + | |
162 | + if (fields.Count > 0) | |
163 | + { | |
164 | + return string.Join("|", fields); | |
165 | + } | |
166 | + return EMPTY_COL; | |
167 | + } | |
168 | + | |
169 | + protected override void WriteSentenceTag(string csa) | |
170 | + { | |
171 | +            if (!m_SentenceTags.TryGetValue(csa, out var seqtagpair)) return; | |
172 | + var t = seqtagpair.Tag.Split('\t'); | |
173 | + if (t.Length < 3) return; | |
174 | + var s = $"<Root>{t[2]}</Root>"; | |
175 | + using (TextReader trdr = new StringReader(s)) | |
176 | + { | |
177 | + XmlReader xrdr = XmlReader.Create(trdr); | |
178 | + while (xrdr.Read()) | |
179 | + { | |
180 | + if (xrdr.Name.Equals("Root")) continue; | |
181 | + var key = xrdr.Name; | |
182 | + var val = xrdr.ReadString(); | |
183 | + if (key.Length == 0) continue; | |
184 | + m_TextWriter.WriteLine($"# {key} = {val}"); | |
185 | + } | |
186 | + } | |
187 | + } | |
188 | + } | |
189 | +} |
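For reference, this exporter writes one row per bunsetsu in the standard 10-column CoNLL-U order (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), preceded by "# key = value" comment lines rebuilt by WriteSentenceTag from the sentence's "@"-prefixed attributes. UPOS/XPOS come from PartOfSpeech.Name1 and the remainder of PartOfSpeech.Name, FEATS is "SP" when trailing whitespace was recorded, and MISC joins segment attributes and Lexeme.CustomProperty as key=value pairs with "|". An illustrative fragment (all values invented):

    # sent_id = doc1-s1
    1	太郎は	太郎は	名詞	固有名詞-人名-名	_	2	D	_	_
    2	走った	走る	動詞	一般	SP	0	D	_	UniDicLemma=走る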
@@ -61,7 +61,7 @@ | ||
61 | 61 | |
62 | 62 | private void ExportItem(Corpus crps, Sentence sen, int project_id = 0) |
63 | 63 | { |
64 | - var groups = Util.RetrieveMWEGroups(m_Session, sen); | |
64 | + var groups = Util.RetrieveMWEGroups(m_Session, sen, project_id); | |
65 | 65 | |
66 | 66 | if (groups.Count > 0) |
67 | 67 | { |
@@ -24,6 +24,8 @@ | ||
24 | 24 | public double Score { get; set; } |
25 | 25 | public int HeadInd { get; set; } // 自立語主辞の文節内位置 |
26 | 26 | public int HeadAnc { get; set; } // 付属語(Ancillary word)主辞の文節内位置 |
27 | + public string Comment { get; set; } | |
28 | + public Dictionary<string, string> Attrs { get; set; } = new Dictionary<string, string>(); | |
27 | 29 | |
28 | 30 | public CabochaBunsetsu(Sentence sen, Document doc, int startPos, int bunsetsuPos, string dependsAs, int dependsTo, double score) |
29 | 31 | : this(sen, doc, startPos, startPos, bunsetsuPos, dependsAs, dependsTo, score) |
@@ -112,25 +112,25 @@ | ||
112 | 112 | } |
113 | 113 | |
114 | 114 | // 現在のChar Position (Document毎のPos) |
115 | - private int m_CurCharPos; | |
115 | + private protected int m_CurCharPos; | |
116 | 116 | // 現在のSentence番号(通しIDおよびDocument毎のPos) |
117 | - private int m_CurSenID; | |
117 | + private protected int m_CurSenID; | |
118 | 118 | // 現在のDocument |
119 | - private Document m_CurDoc; | |
119 | + private protected Document m_CurDoc; | |
120 | 120 | // 現在の文 |
121 | - private Sentence m_CurSen; | |
121 | + private protected Sentence m_CurSen; | |
122 | 122 | // 文節データの一時リスト |
123 | - private CabochaBunsetsuList m_BunsetsuList; | |
123 | + private protected CabochaBunsetsuList m_BunsetsuList; | |
124 | 124 | // 現在(最後に読み込んだ)文節 |
125 | - private CabochaBunsetsu m_CurBunsetsu; | |
125 | + private protected CabochaBunsetsu m_CurBunsetsu; | |
126 | 126 | // 現在の文内において、係り先が-1であるような文節のリスト |
127 | - private List<CabochaBunsetsu> m_CurTerminalBunsetsu; | |
127 | + private protected List<CabochaBunsetsu> m_CurTerminalBunsetsu; | |
128 | 128 | // Document全体の平文内容 |
129 | - private StringBuilder m_DocumentTextBuilder; | |
129 | + private protected StringBuilder m_DocumentTextBuilder; | |
130 | 130 | // 今読んでいるSentenceに付属するBunsetsu以外のSegmentリスト |
131 | - private List<Segment> m_CurSegments; | |
131 | + private protected List<Segment> m_CurSegments; | |
132 | 132 | // 現在の複合語Chunk |
133 | - private CompositeWordChunk m_CompositeWordChunk; | |
133 | + private protected CompositeWordChunk m_CompositeWordChunk; | |
134 | 134 | |
135 | 135 | /// <summary> |
136 | 136 | /// データをデータをストリームからDocumentに読み込む。 |
@@ -299,7 +299,7 @@ | ||
299 | 299 | } |
300 | 300 | else if (s.StartsWith("#")) |
301 | 301 | { |
302 | - // Ignore | |
302 | + ProcessCommentLine(s); | |
303 | 303 | } |
304 | 304 | else if (s.Trim().Length > 0) |
305 | 305 | { |
@@ -380,8 +380,20 @@ | ||
380 | 380 | seg.Doc = buns.Doc; |
381 | 381 | seg.Sentence = buns.Sen; |
382 | 382 | seg.Version = TagSet.CurrentVersion; |
383 | + seg.Comment = buns.Comment; | |
383 | 384 | m_Corpus.AddSegment(seg); |
384 | 385 | buns.Seg = seg; |
386 | + string uniDicLemma = null; | |
387 | + foreach (var pair in buns.Attrs) | |
388 | + { | |
389 | + seg.Attributes.Add(new SegmentAttribute() { | |
390 | + Proj = seg.Proj, | |
391 | + Target = seg, | |
392 | + User = seg.User, | |
393 | + Version = seg.Version, | |
394 | + Key = pair.Key, | |
395 | + Value = pair.Value }); | |
396 | + } | |
385 | 397 | foreach (var w in buns.Words) |
386 | 398 | { |
387 | 399 | w.Bunsetsu = seg; |
@@ -420,8 +432,13 @@ | ||
420 | 432 | Console.WriteLine("> {0} Links.", m_BunsetsuList.Count); |
421 | 433 | } |
422 | 434 | |
435 | + protected virtual void ProcessCommentLine(string s) | |
436 | + { | |
437 | + // do nothing for Comment line (default). | |
438 | + } | |
439 | + | |
423 | 440 | // 各単語に1文節を割り当て、係り受けを付与する |
424 | - private void ProcessOneLine_1(string s, string[] fields) | |
441 | + protected virtual void ProcessOneLine_1(string s, string[] fields) | |
425 | 442 | { |
426 | 443 | var originalSurface = fields[1]; |
427 | 444 | Lexeme m = null; |
@@ -498,7 +515,7 @@ | ||
498 | 515 | |
499 | 516 | // FEATSフィールドの IOB2 タグに応じて複数の行に1語を割り当て、さらにその1語に1文節と係り受けを付与する. |
500 | 517 | // 複合語内の係り受けは破棄する. |
501 | - private void ProcessOneLine_2(string s, string[] fields) | |
518 | + protected virtual void ProcessOneLine_2(string s, string[] fields) | |
502 | 519 | { |
503 | 520 | var feats = new string[0]; |
504 | 521 | if (fields.Length > 5 && fields[5] != "_") |
@@ -9,98 +9,21 @@ | ||
9 | 9 | |
10 | 10 | namespace ChaKi.Service.Readers |
11 | 11 | { |
12 | - public class ConllUReader : CorpusSourceReader | |
12 | + public class ConllUReader : ConllReader | |
13 | 13 | { |
14 | - private static TagSet DefaultTagSet; | |
15 | - | |
16 | - protected Corpus m_Corpus; | |
17 | - | |
18 | - static ConllUReader() | |
19 | - { | |
20 | - // CONLL用のデフォルトTagSet | |
21 | - DefaultTagSet = new TagSet("CabochaTagSet"); | |
22 | - DefaultTagSet.AddTag(new Tag(Tag.SEGMENT, "Bunsetsu")); | |
23 | - | |
24 | - DefaultTagSet.AddVersion(new TagSetVersion("1", 0, true)); | |
25 | - | |
26 | - CabochaReader.TagSet = null; | |
27 | - } | |
28 | - | |
29 | 14 | public ConllUReader(Corpus corpus, LexiconBuilder lb) |
15 | + : base(corpus, lb) | |
30 | 16 | { |
31 | - throw new NotImplementedException("CONLLU format is not fully implemented yet."); | |
32 | - m_Corpus = corpus; | |
33 | - this.LexiconBuilder = lb; | |
34 | - | |
35 | - // TagSetの初期値はCabocha Defaultとし、インポート中に出現したTagを随時加える. | |
36 | - // 複数ファイルインポートの場合、ファイルを読み込むたびにここを通るが、 | |
37 | - // そのたびごとにstaticなTagSetをリセットすると、Segment, Link等から参照しているTagが | |
38 | - // 次々と異なるものになってしまうので、既にTagSetに存在しているものを変更しないようにする. | |
39 | - if (CabochaReader.TagSet == null) | |
40 | - { | |
41 | - CabochaReader.TagSet = new TagSet(); | |
42 | - } | |
43 | - CabochaReader.TagSet.MergeWith(DefaultTagSet); | |
44 | 17 | } |
45 | 18 | |
46 | - public LexiconBuilder LexiconBuilder { get; set; } // 原始Lexicon | |
47 | - | |
48 | - public string EncodingToUse { get; set; } | |
49 | - | |
50 | - public bool FromDictionary { get; set; } // 辞書ファイルを読み込む場合はTrue; テキストを読み込む場合はFalse | |
51 | - private static TagSet TagSet => CabochaReader.TagSet; | |
52 | - | |
53 | - /// <summary> | |
54 | - /// CONLLデータをCorpusに読み込む(係り受け情報はSegment,Linkテーブルに入れる) | |
55 | - /// </summary> | |
56 | - /// <param name="path"></param> | |
57 | - /// <param name="encoding"></param> | |
58 | - public Document ReadFromFileSLA(string path, string encoding) | |
59 | - { | |
60 | - var newdoc = new Document(); | |
61 | - newdoc.FileName = path; | |
62 | - using (var streamReader = new StreamReader(path, Encoding.GetEncoding(encoding))) | |
63 | - { | |
64 | - ReadFromStreamSLA(streamReader, -1, newdoc); | |
65 | - } | |
66 | - return newdoc; | |
67 | - } | |
68 | - | |
69 | - [Obsolete] | |
70 | - public Document ReadFromFile(string path, string encoding) | |
71 | - { | |
72 | - throw new System.NotImplementedException(); | |
73 | - } | |
74 | - | |
75 | - // 現在のDocument | |
76 | - private Document m_CurDoc; | |
77 | - // 現在の文 | |
78 | - private Sentence m_CurSen; | |
79 | - // 現在のSentence番号(通しIDおよびDocument毎のPos) | |
80 | - private int m_CurSenID; | |
81 | - // 現在のChar Position (Document毎のPos) | |
82 | - private int m_CurCharPos; | |
83 | - // 文節データの一時リスト | |
84 | - private CabochaBunsetsuList m_BunsetsuList; | |
85 | - // 現在(最後に読み込んだ)文節 | |
86 | - private CabochaBunsetsu m_CurBunsetsu; | |
87 | - // 現在の文内において、係り先が-1であるような文節のリスト | |
88 | - private List<CabochaBunsetsu> m_CurTerminalBunsetsu; | |
89 | - // 文のDocument単位の出現リスト | |
90 | - private List<Sentence> m_SentencesInDoc; | |
91 | - // 現在の複合語Chunk | |
92 | - private CompositeWordChunk m_CompositeWordChunk; | |
93 | - // Document全体の平文内容 | |
94 | - private StringBuilder m_DocumentTextBuilder; | |
95 | - | |
96 | 19 | // 各単語に1文節を割り当て、係り受けを付与する |
97 | - private void ProcessOneLine_1(string s, string[] fields) | |
20 | + protected override void ProcessOneLine_1(string s, string[] fields) | |
98 | 21 | { |
99 | 22 | var originalSurface = fields[1]; |
100 | 23 | Lexeme m = null; |
101 | 24 | try |
102 | 25 | { |
103 | - m = this.LexiconBuilder.AddEntryConll(s, this.FromDictionary, false); | |
26 | + m = this.LexiconBuilder.AddEntryConllU(s, this.FromDictionary, false); | |
104 | 27 | } |
105 | 28 | catch (Exception) |
106 | 29 | { |
@@ -119,6 +42,27 @@ | ||
119 | 42 | } |
120 | 43 | var buns = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, f0, f7, f6, 0.0); |
121 | 44 | buns.EndPos = buns.StartPos + originalSurface.Length; |
45 | + if (fields.Length > 9 && fields[9] != "_") | |
46 | + { | |
47 | + foreach (var pairs in fields[9].Split('|')) | |
48 | + { | |
49 | + var pair = pairs.Split('='); | |
50 | + if (pair.Length == 2 && !buns.Attrs.ContainsKey(pair[0])) | |
51 | + { | |
52 | + if (pair[0] == "UniDicLemma") | |
53 | + { | |
54 | + // UniDicLemma属性はlexemeに持たせる. | |
55 | + m.CustomProperty = $"{pair[0]}\t{pair[1]}"; | |
56 | + } | |
57 | + else | |
58 | + { | |
59 | + // それ以外はBunsetsu SegmentのAttributeに持たせる. | |
60 | + buns.Attrs.Add(pair[0], pair[1]); | |
61 | + } | |
62 | + } | |
63 | + | |
64 | + } | |
65 | + } | |
122 | 66 | m_CurBunsetsu = buns; |
123 | 67 | if (buns.DependsTo == -1) |
124 | 68 | { |
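As a concrete illustration of the MISC handling above: for a token whose 10th (MISC) column is "UniDicLemma=走る|NE=B-PERSON" (the NE key is invented for this example), the reader stores "UniDicLemma\t走る" in Lexeme.CustomProperty, while the remaining pairs such as NE=B-PERSON are collected in CabochaBunsetsu.Attrs and later persisted as SegmentAttribute rows on the Bunsetsu segment (see the CabochaReader hunk above).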
@@ -130,35 +74,16 @@ | ||
130 | 74 | { |
131 | 75 | Console.WriteLine(string.Format("Bunsetsu parse error: {0}", s)); |
132 | 76 | } |
133 | - Word w = null; | |
134 | - var feats = new string[0]; | |
135 | - if (fields.Length > 5 && fields[5] != "_") | |
77 | + // SentenceとBunsetsuにWordを追加. | |
78 | + var w = m_CurSen.AddWord(m); | |
79 | + w.StartChar = m_CurCharPos; | |
80 | + w.EndChar = m_CurCharPos + originalSurface.Length; // Word.Lengthを使ってもよいが、空白を含む文字列長であることに注意. | |
81 | + // Surfaceの末尾の空白をWordに記録 | |
82 | + if (!originalSurface.Equals(m.Surface)) | |
136 | 83 | { |
137 | - feats = fields[5].Split('|'); | |
84 | + w.Extras = GetDiff(originalSurface, m.Surface); | |
138 | 85 | } |
139 | - if (feats.Contains("SP")) | |
140 | - { | |
141 | - // FEATURESで空白を指定している場合 | |
142 | - // SentenceとBunsetsuにWordを追加. | |
143 | - w = m_CurSen.AddWord(m); | |
144 | - w.StartChar = m_CurCharPos; | |
145 | - w.EndChar = m_CurCharPos + w.CharLength + 1; | |
146 | - w.Extras = " "; | |
147 | - m_DocumentTextBuilder.Append(originalSurface + w.Extras); | |
148 | - } | |
149 | - else | |
150 | - { | |
151 | - // SentenceとBunsetsuにWordを追加. | |
152 | - w = m_CurSen.AddWord(m); | |
153 | - w.StartChar = m_CurCharPos; | |
154 | - w.EndChar = m_CurCharPos + originalSurface.Length; // Word.Lengthを使ってもよいが、空白を含む文字列長であることに注意. | |
155 | - // Surfaceの末尾の空白をWordに記録 | |
156 | - //if (!originalSurface.Equals(m.Surface)) | |
157 | - //{ | |
158 | - // w.Extras = GetDiff(originalSurface, m.Surface); | |
159 | - //} | |
160 | - m_DocumentTextBuilder.Append(originalSurface); | |
161 | - } | |
86 | + m_DocumentTextBuilder.Append(originalSurface); | |
162 | 87 | |
163 | 88 | // 文節にこの語を割り当てる |
164 | 89 | if (m_CurBunsetsu != null) |
@@ -169,260 +94,31 @@ | ||
169 | 94 | } |
170 | 95 | } |
171 | 96 | |
172 | - public void ReadFromStreamSLA(TextReader rdr, int sentenceCount, Document doc) | |
97 | + // CONLLUではFEATSフィールドの IOB2 タグは見ない. | |
98 | + protected override void ProcessOneLine_2(string s, string[] fields) | |
173 | 99 | { |
174 | - this.FromDictionary = false; | |
175 | - | |
176 | - string s; | |
177 | - | |
178 | - m_CurDoc = doc; | |
179 | - m_BunsetsuList = new CabochaBunsetsuList(); | |
180 | - m_CurCharPos = 0; | |
181 | - m_CurSenID = 0; | |
182 | - m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = 0 }; | |
183 | - m_CurBunsetsu = null; // 最後に読んだ文節 | |
184 | - m_CurTerminalBunsetsu = new List<CabochaBunsetsu>(); // 現在の文内において、係り先が-1であるような文節 | |
185 | - //m_CurSegments = new List<Segment>(); // 今読んでいるSentenceに付属するBunsetsu以外のSegmentリスト | |
186 | - m_DocumentTextBuilder = new StringBuilder(); | |
187 | - object lastAnnotationTag = m_CurDoc; // 現在の文内において、最後に読んだDocumentまたはSegment(文節以外)またはLinkまたはGroup | |
188 | - m_SentencesInDoc = new List<Sentence>(); | |
189 | - //m_SegmentsInDoc = new List<Segment>(); | |
190 | - m_CompositeWordChunk = new CompositeWordChunk(); | |
191 | - | |
192 | - var currentComposite = string.Empty; | |
193 | - | |
194 | - int n = 0; | |
195 | - while (true) | |
196 | - { | |
197 | - s = rdr.ReadLine(); | |
198 | - if (s == null) | |
199 | - { | |
200 | - if (m_CurSen.Words.Count > 0) // 正常なファイル末尾なら、m_CurSenはクリアされているはず。 | |
201 | - { | |
202 | - s = string.Empty;// ファイル末尾にEOS行(空行)がないので、空行の存在をシミュレートする. | |
203 | - } | |
204 | - else | |
205 | - { | |
206 | - break; | |
207 | - } | |
208 | - } | |
209 | - s = Cleanup(s); // ファイル途中のBOM(catした時に残っている場合がある)を削除する | |
210 | - | |
211 | - if (s.StartsWith("#")) | |
212 | - { | |
213 | - // Ignore | |
214 | - } | |
215 | - else if (s.Trim().Length > 0) | |
216 | - { | |
217 | - // 語を表す1行を処理する. | |
218 | - var fields = s.Split('\t'); | |
219 | - try | |
220 | - { | |
221 | - ProcessOneLine_2(s, fields); | |
222 | - } | |
223 | - catch (Exception ex) | |
224 | - { | |
225 | - Console.WriteLine("At line {0}: Error: {1}", n, ex.Message); | |
226 | - } | |
227 | - } | |
228 | - else | |
229 | - { | |
230 | - // CONLL: 空行は文の終わり | |
231 | - if (m_CurBunsetsu == null) | |
232 | - { // デフォルト文節を追加(入力がChasen/Mecabの場合のため) | |
233 | - var buns = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurSen.StartChar, 0, String.Empty, -1, 0.0); | |
234 | - buns.EndPos = m_CurCharPos; | |
235 | - m_BunsetsuList.Add(buns); | |
236 | - m_CurBunsetsu = buns; | |
237 | - m_CurTerminalBunsetsu.Add(buns); | |
238 | - } | |
239 | - // 終端ダミー文節を追加 | |
240 | - var dummy = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, m_CurBunsetsu.BunsetsuPos + 1, String.Empty, -1, 0.0); | |
241 | - m_BunsetsuList.Add(dummy); | |
242 | - // 係り先が-1である文節をdummyに係るようにする。 | |
243 | - if (m_CurTerminalBunsetsu != null && m_CurTerminalBunsetsu.Count > 0) | |
244 | - { | |
245 | - foreach (var buns in m_CurTerminalBunsetsu) | |
246 | - { | |
247 | - buns.DependsTo = dummy.BunsetsuPos; | |
248 | - } | |
249 | - } | |
250 | - | |
251 | - if (++n % 1000 == 0) | |
252 | - { | |
253 | - Console.Write("> {0} Sentences.\r", n); | |
254 | - } | |
255 | - m_CurSen.EndChar = m_CurCharPos; | |
256 | - | |
257 | - m_SentencesInDoc.Add(m_CurSen); | |
258 | - m_Corpus.AddSentence(m_CurSen); | |
259 | - if (sentenceCount > 0 && n >= sentenceCount) | |
260 | - { | |
261 | - break; | |
262 | - } | |
263 | - // 以降のWordのために、新しいSentenceを初期化して用意する。 | |
264 | - int lastsenpos = m_CurSen.Pos + 1; | |
265 | - m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = lastsenpos }; | |
266 | - m_CurSen.StartChar = m_CurCharPos; | |
267 | - m_CurBunsetsu = null; | |
268 | - m_CurTerminalBunsetsu.Clear(); | |
269 | - //m_CurSegments = new List<Segment>(); | |
270 | - m_CompositeWordChunk.Clear(); | |
271 | - lastAnnotationTag = null; | |
272 | - } | |
273 | - } | |
274 | - m_CurDoc.Text = m_DocumentTextBuilder.ToString(); | |
275 | - Console.Write("> {0} Sentences.\r", n); | |
276 | - | |
277 | - Console.WriteLine(); | |
278 | - // BunsetsuをSegment&LinkとしてCorpusに登録 | |
279 | - var bunsetsuTag = TagSet.FindTag(Tag.SEGMENT, "Bunsetsu"); | |
280 | - n = 0; | |
281 | - foreach (var buns in m_BunsetsuList.Values) | |
282 | - { | |
283 | - if (++n % 100 == 0) | |
284 | - { | |
285 | - Console.Write("> {0} Segments.\r", n); | |
286 | - } | |
287 | - var seg = new Segment(); | |
288 | - seg.StartChar = buns.StartPos; | |
289 | - seg.EndChar = buns.EndPos; | |
290 | - seg.Tag = bunsetsuTag; | |
291 | - seg.Doc = buns.Doc; | |
292 | - seg.Sentence = buns.Sen; | |
293 | - seg.Version = TagSet.CurrentVersion; | |
294 | - m_Corpus.AddSegment(seg); | |
295 | - buns.Seg = seg; | |
296 | - foreach (var w in buns.Words) | |
297 | - { | |
298 | - w.Bunsetsu = seg; | |
299 | - } | |
300 | - } | |
301 | - Console.WriteLine("> {0} Segments.", m_BunsetsuList.Count); | |
302 | - n = 0; | |
303 | - foreach (var buns in m_BunsetsuList.Values) | |
304 | - { | |
305 | - if (++n % 100 == 0) | |
306 | - { | |
307 | - Console.Write("> {0} Links.\r", n); | |
308 | - } | |
309 | - var depBunsetsu = m_BunsetsuList.Find(buns.Sen, buns.DependsTo); | |
310 | - if (depBunsetsu != null) | |
311 | - { | |
312 | - var link = new Link(); | |
313 | - link.From = buns.Seg; | |
314 | - link.To = depBunsetsu.Seg; | |
315 | - link.FromSentence = buns.Sen; | |
316 | - link.ToSentence = buns.Sen; | |
317 | - link.Tag = TagSet.FindOrAddTag(Tag.LINK, buns.DependsAs); | |
318 | - link.Version = TagSet.CurrentVersion; | |
319 | - link.Attributes.Add(new LinkAttribute() | |
320 | - { | |
321 | - Proj = link.Proj, | |
322 | - Target = link, | |
323 | - User = link.User, | |
324 | - Version = link.Version, | |
325 | - Key = "Score", | |
326 | - Value = buns.Score.ToString() | |
327 | - }); | |
328 | - m_Corpus.AddLink(link); | |
329 | - } | |
330 | - } | |
331 | - Console.WriteLine("> {0} Links.", m_BunsetsuList.Count); | |
100 | + ProcessOneLine_1(s, fields); | |
332 | 101 | } |
333 | 102 | |
334 | - // FEATSフィールドの IOB2 タグに応じて複数の行に1語を割り当て、さらにその1語に1文節と係り受けを付与する. | |
335 | - // 複合語内の係り受けは破棄する. | |
336 | - private void ProcessOneLine_2(string s, string[] fields) | |
103 | + protected override void ProcessCommentLine(string s) | |
337 | 104 | { |
338 | - var feats = new string[0]; | |
339 | - if (fields.Length > 5 && fields[5] != "_") | |
105 | + // コメント行にあるsent_idをSentence属性に格納 | |
106 | + if (s.StartsWith("# sent_id = ")) | |
340 | 107 | { |
341 | - feats = fields[5].Split('|'); | |
342 | - } | |
343 | - var btag = feats.FirstOrDefault(f => f.StartsWith("B-")); // 複数のBタグがあった場合、最初のもののみ有効. | |
344 | - var itag = feats.FirstOrDefault(f => f.StartsWith("I-")); | |
345 | - | |
346 | - // 複合語の終了か | |
347 | - if (itag == null) | |
348 | - { | |
349 | - // 現在までに収集したCompositeWordChunkがあれば先に出力 | |
350 | - if (!m_CompositeWordChunk.IsEmpty()) | |
108 | + // SentenceAttrはDocumentAttrへのリンクになる. | |
109 | + var attrid = m_CurDoc.Attributes.Count; | |
110 | + m_CurDoc.Attributes.Add(new DocumentAttribute() | |
351 | 111 | { |
352 | - try | |
353 | - { | |
354 | - var scw = m_CompositeWordChunk.ToConllSingleLine(); | |
355 | - var scwf = scw.Split('\t'); | |
356 | - ProcessOneLine_1(scw, scwf); | |
357 | - } | |
358 | - finally | |
359 | - { | |
360 | - m_CompositeWordChunk.Clear(); | |
361 | - } | |
362 | - } | |
363 | - } | |
364 | - if (btag != null) // BならChunkを初期化 | |
365 | - { | |
366 | - m_CompositeWordChunk.Clear(); | |
367 | - m_CompositeWordChunk.ChunkPOS = btag.Substring(2); | |
368 | - } | |
369 | - if (btag != null || itag != null) // BまたはI | |
370 | - { | |
371 | - // 現在の語をCompositeWordChunkに追加し、出力は行わない. | |
372 | - var f0 = Int32.Parse(fields[0]) - 1; | |
373 | - var f5 = (fields.Length > 5) ? fields[5] : "_"; | |
374 | - var f6 = f0 + 1; | |
375 | - if (fields.Length > 6 && fields[6] != "_") | |
112 | + ID = attrid, | |
113 | + Key = "@sent_id", | |
114 | +                    Value = s.Substring("# sent_id = ".Length) | |
115 | + }); | |
116 | + m_CurSen.Attributes.Add(new SentenceAttribute() | |
376 | 117 | { |
377 | - f6 = Int32.Parse(fields[6]) - 1; | |
378 | - } | |
379 | - var f7 = (fields.Length > 7) ? fields[7] : string.Empty; | |
380 | - m_CompositeWordChunk.Add(f0, fields[1], fields[2], f5, f6, f7); | |
118 | + ID = attrid | |
119 | + }); | |
381 | 120 | } |
382 | - else // BでもIでもない | |
383 | - { | |
384 | - // 現在の行を出力 | |
385 | - ProcessOneLine_1(s, fields); | |
386 | - } | |
387 | 121 | } |
388 | 122 | |
389 | - public void ReadLexiconFromStream(TextReader rdr, bool baseOnly) | |
390 | - { | |
391 | - throw new System.NotImplementedException(); | |
392 | - } | |
393 | - | |
394 | - public void ReadLexiconFromStream(TextReader rdr) | |
395 | - { | |
396 | - throw new System.NotImplementedException(); | |
397 | - } | |
398 | - | |
399 | - public void SetFieldDefs(Field[] fieldDefs) | |
400 | - { | |
401 | - throw new System.NotImplementedException(); | |
402 | - } | |
403 | - | |
404 | - /// <summary> | |
405 | - /// 行頭のBOMを除去する | |
406 | - /// </summary> | |
407 | - /// <param name="input"></param> | |
408 | - /// <returns></returns> | |
409 | - public static string Cleanup(string input) | |
410 | - { | |
411 | - var sb = new StringBuilder(); | |
412 | - var firstChar = true; | |
413 | - foreach (var c in input) | |
414 | - { | |
415 | - if (c != 0xFEFF) | |
416 | - { | |
417 | - sb.Append(c); | |
418 | - if (firstChar) | |
419 | - { | |
420 | - return input; | |
421 | - } | |
422 | - } | |
423 | - firstChar = false; | |
424 | - } | |
425 | - return sb.ToString(); | |
426 | - } | |
427 | 123 | } |
428 | 124 | } |
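To make the sent_id round trip concrete: a comment line "# sent_id = ud-ja-001" handled by ProcessCommentLine adds a DocumentAttribute { Key = "@sent_id", Value = "ud-ja-001" } to the current Document and attaches a SentenceAttribute carrying that attribute's ID to the current Sentence (a SentenceAttribute only references a DocumentAttribute, per the spec note in Document above). On CONLLU export, ExportServiceConllU.WriteSentenceTag turns the same "@"-prefixed attribute back into a "# sent_id = ud-ja-001" comment line instead of a SENTENCETAGID entry.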
@@ -639,5 +639,54 @@ | ||
639 | 639 | |
640 | 640 | return ret; |
641 | 641 | } |
642 | + | |
643 | + public Lexeme AddEntryConllU(string s, bool fromDictionary, bool baseOnly) | |
644 | + { | |
645 | + string[] props = SplitConllUFormat(s); | |
646 | + if (fromDictionary) | |
647 | + { | |
648 | + return AddEntrySimple(props, baseOnly); | |
649 | + } | |
650 | + else | |
651 | + { | |
652 | + return AddEntry(props); | |
653 | + } | |
654 | + } | |
655 | + | |
656 | + public string[] SplitConllUFormat(string s) | |
657 | + { | |
658 | + var lex = new Lexeme(); | |
659 | + | |
660 | + var fields = s.Split('\t'); | |
661 | + if (fields.Length < 4) | |
662 | + { | |
663 | + throw new Exception(string.Format("Mismatch in field count. Required=4~10; Seen={0}", fields.Length)); | |
664 | + } | |
665 | + var ret = new string[10] { string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty }; | |
666 | + // 0: ID in a sentence | |
667 | + // 1: Surface Form | |
668 | + ret[0] = fields[1].Trim(); // ここで末尾の空白を取る。インポート時に削除するのは原則末尾の空白のみ。 | |
669 | + // 2: Lemma Form | |
670 | + if (!fields[2].Equals("_")) | |
671 | + { | |
672 | + ret[5] = fields[2]; | |
673 | + } | |
674 | + // 3-4: POS | |
675 | + if (!fields[3].Equals("_")) | |
676 | + { | |
677 | + ret[6] = fields[3]; | |
678 | + } | |
679 | + if (fields.Length > 4 && !fields[4].Equals("_")) | |
680 | + { | |
681 | + ret[6] = string.Format("{0}-{1}", ret[6], fields[4]); | |
682 | + } | |
683 | + // 5: Features | |
684 | + // 6: Head | |
685 | + // 7: DepRel | |
686 | + // 8: PHead | |
687 | + // 9: PDepRel | |
688 | + | |
689 | + return ret; | |
690 | + } | |
642 | 691 | } |
643 | 692 | } |
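A self-contained sketch of the field mapping performed by SplitConllUFormat above (the sample line and its values are invented; the logic mirrors the method, which folds CoNLL-U FORM/LEMMA/UPOS/XPOS into the internal 10-element property array consumed by AddEntry):

    // Minimal, runnable illustration of the CoNLL-U -> internal property mapping.
    using System;

    static class ConllUMappingDemo
    {
        public static void Main()
        {
            // ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            var fields = "1\t走った\t走る\tVERB\t動詞-一般\t_\t0\troot\t_\t_".Split('\t');

            var props = new string[10];
            for (int i = 0; i < props.Length; i++) props[i] = string.Empty;

            props[0] = fields[1].Trim();                  // FORM  -> surface (whitespace trimmed)
            if (fields[2] != "_") props[5] = fields[2];   // LEMMA -> base form
            if (fields[3] != "_") props[6] = fields[3];   // UPOS  -> POS name
            if (fields.Length > 4 && fields[4] != "_")
                props[6] = $"{props[6]}-{fields[4]}";     // XPOS appended as "UPOS-XPOS"

            Console.WriteLine($"{props[0]} / {props[5]} / {props[6]}");
            // -> 走った / 走る / VERB-動詞-一般
        }
    }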
@@ -12,5 +12,5 @@ | ||
12 | 12 | [assembly: AssemblyCulture("")] |
13 | 13 | [assembly: ComVisible(false)] |
14 | 14 | [assembly: Guid("a8cf8403-eb88-418f-bf54-56aeaef39268")] |
15 | -[assembly: AssemblyVersion("3.14.642.0")] | |
16 | -[assembly: AssemblyFileVersion("3.14.642.0")] | |
15 | +[assembly: AssemblyVersion("3.14.648.0")] | |
16 | +[assembly: AssemblyFileVersion("3.14.648.0")] |
@@ -12,5 +12,5 @@ | ||
12 | 12 | [assembly: AssemblyCulture("")] |
13 | 13 | [assembly: ComVisible(false)] |
14 | 14 | [assembly: Guid("ff6652ed-b932-466b-944b-ce88d698979b")] |
15 | -[assembly: AssemblyVersion("3.14.642.0")] | |
16 | -[assembly: AssemblyFileVersion("3.14.642.0")] | |
15 | +[assembly: AssemblyVersion("3.14.648.0")] | |
16 | +[assembly: AssemblyFileVersion("3.14.648.0")] |