• R/O
  • SSH
  • HTTPS

chaki: Commit


Commit MetaInfo

Revision 649 (tree)
Time 2021-01-12 20:43:01
Author tomorita

Log Message

CONLLU import/export実装完了
DOC/SENTENCETAGの仕様を変更(Key名が"@"で始まるものはSentence専用のattributeとする)
Export処理でprojidを指定できるよう修正(途中)

Change Summary

Incremental Difference

--- trunk/ChaKi.NET/src/ChaKi.NET/Panels/AttributeListPanel.cs (revision 648)
+++ trunk/ChaKi.NET/src/ChaKi.NET/Panels/AttributeListPanel.cs (revision 649)
@@ -161,7 +161,10 @@
161161 rows.Add(new AttributeData("ID", sen.ParentDoc.ID.ToString(), AttributeGridRowType.ReadOnly));
162162 foreach (DocumentAttribute a in sen.ParentDoc.Attributes)
163163 {
164- rows.Add(new AttributeData(a.Key, a.Value, AttributeGridRowType.KeyValueWritable));
164+ if (!a.Key.StartsWith("@"))
165+ {
166+ rows.Add(new AttributeData(a.Key, a.Value, AttributeGridRowType.KeyValueWritable));
167+ }
165168 }
166169 // --> ここだけ変則的。呼び出し元のSetSource()に存在すべきものである。Sentence/Documentについても、sourceをobject[]にすればすっきりする(TODO)。
167170 if (this.attributeGrid1.IsEditing)
--- trunk/ChaKi.NET/src/ChaKi.NET/Panels/ConditionsPanes/CorpusPane.cs (revision 648)
+++ trunk/ChaKi.NET/src/ChaKi.NET/Panels/ConditionsPanes/CorpusPane.cs (revision 649)
@@ -499,6 +499,10 @@
499499 {
500500 filterStr.AppendFormat("{0}|*.conll|", def.Name);
501501 }
502+ else if (def.Name == "CONLLU")
503+ {
504+ filterStr.AppendFormat("{0}|*.conllu|", def.Name);
505+ }
502506 else //TODO: とりあえずテキストエクスポートは選択できないようにしておく
503507 {
504508 continue;
@@ -554,6 +558,10 @@
554558 {
555559 svc = new ExportServiceConll(wr);
556560 }
561+ else if (def.Name == "CONLLU")
562+ {
563+ svc = new ExportServiceConllU(wr);
564+ }
557565 else
558566 {
559567 svc = new ExportServiceCabocha(wr, def);
--- trunk/ChaKi.NET/src/ChaKi.NET/ToolDialogs/CreateMySQLCorpus.cs (revision 648)
+++ trunk/ChaKi.NET/src/ChaKi.NET/ToolDialogs/CreateMySQLCorpus.cs (revision 649)
@@ -51,7 +51,7 @@
5151 {
5252 OpenFileFolderDialog dlg = new OpenFileFolderDialog();
5353 dlg.Title = "Select Input File/Folder";
54- dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*";
54+ dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|CONLLU files (*.conllu)|*.conllu|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*";
5555 if (ImportFileLastSelectedFilterIndex > 0)
5656 {
5757 dlg.FilterIndex = ImportFileLastSelectedFilterIndex;
--- trunk/ChaKi.NET/src/ChaKi.NET/ToolDialogs/CreateSQLiteCorpus.cs (revision 648)
+++ trunk/ChaKi.NET/src/ChaKi.NET/ToolDialogs/CreateSQLiteCorpus.cs (revision 649)
@@ -101,7 +101,7 @@
101101 {
102102 OpenFileFolderDialog dlg = new OpenFileFolderDialog();
103103 dlg.Title = "Select Input File/Folder";
104- dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*";
104+ dlg.FilterSpec = "Cabocha files (*.cabocha)|*.cabocha|ChaSen files (*.chasen)|*.chasen|MeCab files (*.mecab)|*.mecab|CONLL files (*.conll)|*.conll|CONLLU files (*.conllu)|*.conllu|Text files (*.txt)|*.txt|All files (*.*)|*.*|Folders|*.*";
105105 dlg.FileMustExist = true;
106106 if (ImportFileLastSelectedFilterIndex > 0)
107107 {
--- trunk/ChaKi.NET/src/Entity/Corpora/Annotations/AttributeBase32.cs (revision 648)
+++ trunk/ChaKi.NET/src/Entity/Corpora/Annotations/AttributeBase32.cs (revision 649)
@@ -25,7 +25,14 @@
2525 {
2626 using (XmlWriter wr = new XmlTextWriter(twr))
2727 {
28- wr.WriteElementString(this.Key, this.Value);
28+ if (this.Key.StartsWith("@"))
29+ {
30+ wr.WriteElementString(this.Key.Substring(1), this.Value);
31+ }
32+ else
33+ {
34+ wr.WriteElementString(this.Key, this.Value);
35+ }
2936 return twr.ToString();
3037 }
3138 }
--- trunk/ChaKi.NET/src/Entity/Corpora/Document.cs (revision 648)
+++ trunk/ChaKi.NET/src/Entity/Corpora/Document.cs (revision 649)
@@ -22,6 +22,15 @@
2222 /// </summary>
2323 public virtual string Text { get; set; }
2424
25+ /// <summary>
26+ /// 2021.1.11仕様追加
27+ /// 従来より、DocumentAttributeの一部はSentenceAttributeとして使われている.
28+ /// (Sentence自体はDocumentAttributeへのIDしか持てない.)
29+ /// SentenceAttribute専用のAttributeを区別するため、Keyの先頭に"@"を付与するものとする.
30+ /// Export時に、
31+ /// @付きのAttributeは、SENTENCETAGIDとしてのみ出力される.
32+ /// @なしのAttributeは、DOCIDおよびDocumentの#ATTRタグに出力される.
33+ /// </summary>
2534 public virtual ISet<DocumentAttribute> Attributes { get; set; }
2635
2736 public Document()
@@ -76,7 +85,10 @@
7685 {
7786 foreach (DocumentAttribute attr in this.Attributes)
7887 {
79- wr.WriteElementString(attr.Key, attr.Value);
88+ if (!attr.Key.StartsWith("@"))
89+ {
90+ wr.WriteElementString(attr.Key, attr.Value);
91+ }
8092 }
8193 return twr.ToString();
8294 }
--- trunk/ChaKi.NET/src/ImportWordRelation/Properties/AssemblyInfo.cs (revision 648)
+++ trunk/ChaKi.NET/src/ImportWordRelation/Properties/AssemblyInfo.cs (revision 649)
@@ -12,5 +12,5 @@
1212 [assembly: AssemblyCulture("")]
1313 [assembly: ComVisible(false)]
1414 [assembly: Guid("6a95808a-d1e3-47de-bb62-7ed7a281ac0b")]
15-[assembly: AssemblyVersion("3.14.642.0")]
16-[assembly: AssemblyFileVersion("3.14.642.0")]
15+[assembly: AssemblyVersion("3.14.648.0")]
16+[assembly: AssemblyFileVersion("3.14.648.0")]
--- trunk/ChaKi.NET/src/Service/Common/Util.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Common/Util.cs (revision 649)
@@ -114,7 +114,7 @@
114114 //}
115115
116116 // Groupは直接文と関連ついていないので、segment->group_member->groupと順にたどって、文に付いているGroupを求める.
117- public static IList<Group> RetrieveWordGroups(ISession sess, Sentence sen)
117+ public static IList<Group> RetrieveWordGroups(ISession sess, Sentence sen, int projid)
118118 {
119119 var sgtags = GetSegmentTagIDForGroups(sess); // これはSegmentのタグで、Groupに関係あるもの.
120120 if (sgtags.Count == 0)
@@ -131,9 +131,10 @@
131131 List<long> segs = new List<long>();
132132 using (var cmd = sess.Connection.CreateCommand())
133133 {
134- cmd.CommandText = string.Format("SELECT id from segment"
135- + " WHERE document_id={0} AND sentence_id={1} AND tag_definition_id IN {2}",
136- sen.ParentDoc.ID, sen.ID, BuildIDList(sgtags));
134+ cmd.CommandText = "SELECT id from segment" +
135+ $" WHERE document_id={sen.ParentDoc.ID} AND sentence_id={sen.ID}" +
136+ $" AND tag_definition_id IN {BuildIDList(sgtags)}" +
137+ $" AND project_id={projid}";
137138 var rdr = cmd.ExecuteReader();
138139 while (rdr.Read())
139140 {
@@ -172,7 +173,7 @@
172173 return q.List<Group>();
173174 }
174175
175- public static IList<Group> RetrieveMWEGroups(ISession sess, Sentence sen)
176+ public static IList<Group> RetrieveMWEGroups(ISession sess, Sentence sen, int projid)
176177 {
177178 if (MweSegTag < 0)
178179 {
@@ -188,7 +189,7 @@
188189 {
189190 cmd.CommandText = "SELECT id from segment"
190191 + $" WHERE document_id={sen.ParentDoc.ID} AND sentence_id={sen.ID} "
191- + $"AND tag_definition_id={MweSegTag}";
192+ + $"AND tag_definition_id={MweSegTag} AND project_id={projid}";
192193 var rdr = cmd.ExecuteReader();
193194 while (rdr.Read())
194195 {
@@ -205,8 +206,9 @@
205206 List<long> groupids = new List<long>();
206207 using (var cmd = sess.Connection.CreateCommand())
207208 {
208- cmd.CommandText = "SELECT DISTINCT group_id FROM group_member"
209- + $" WHERE member_id IN {BuildIDList(segs)} AND object_Type='{Tag.SEGMENT}' ORDER BY group_id ASC";
209+ cmd.CommandText = "SELECT DISTINCT group_id FROM group_member" +
210+ $" WHERE member_id IN {BuildIDList(segs)} " +
211+ $"AND object_Type='{Tag.SEGMENT}' ORDER BY group_id ASC";
210212 var rdr = cmd.ExecuteReader();
211213 while (rdr.Read())
212214 {
@@ -231,17 +233,17 @@
231233 /// <param name="sess"></param>
232234 /// <param name="sen"></param>
233235 /// <returns></returns>
234- public static IList<long> RetrieveMiscSegments(ISession sess, Sentence sen)
236+ public static IList<long> RetrieveMiscSegments(ISession sess, Sentence sen, int projid)
235237 {
236238 var gtag = GetBunsetsuTagId(sess);
237239
238240 using (var cmd = sess.Connection.CreateCommand())
239241 {
240- cmd.CommandText = string.Format(
241- "SELECT s.id FROM segment s"
242- + " INNER JOIN tag_definition t ON s.tag_definition_id=t.id"
243- + " WHERE s.document_id={0} AND s.sentence_id={1} and s.tag_definition_id!={2}",
244- sen.ParentDoc.ID, sen.ID, gtag);
242+ cmd.CommandText =
243+ "SELECT s.id FROM segment s" +
244+ " INNER JOIN tag_definition t ON s.tag_definition_id=t.id" +
245+ $" WHERE s.document_id={sen.ParentDoc.ID} AND s.sentence_id={sen.ID}" +
246+ $" and s.tag_definition_id!={gtag} and s.project_id={projid}";
245247 var rdr = cmd.ExecuteReader();
246248 var result = new List<long>();
247249 while (rdr.Read())
@@ -323,19 +325,19 @@
323325 /// <param name="sess"></param>
324326 /// <param name="sen"></param>
325327 /// <returns></returns>
326- public static IList<long> RetrieveMiscLinks(ISession sess, Sentence sen)
328+ public static IList<long> RetrieveMiscLinks(ISession sess, Sentence sen, int projid)
327329 {
328330 long bunsetsuid = GetBunsetsuTagId(sess);
329331
330332 using (var cmd = sess.Connection.CreateCommand())
331333 {
332- cmd.CommandText = string.Format(
333- "SELECT l.id FROM link l"
334- + " INNER JOIN tag_definition t ON t.id=l.tag_definition_id"
335- + " INNER JOIN segment s1 ON s1.id=l.from_segment_id"
336- + " INNER JOIN segment s2 ON s2.id=l.to_segment_id"
337- + " WHERE l.from_sentence_id={0} AND s1.tag_definition_id !={1} AND s2.tag_definition_id != {1}",
338- sen.ID, bunsetsuid);
334+ cmd.CommandText =
335+ "SELECT l.id FROM link l" +
336+ " INNER JOIN tag_definition t ON t.id=l.tag_definition_id" +
337+ " INNER JOIN segment s1 ON s1.id=l.from_segment_id" +
338+ " INNER JOIN segment s2 ON s2.id=l.to_segment_id" +
339+ $" WHERE l.from_sentence_id={sen.ID} AND s1.tag_definition_id !={bunsetsuid}" +
340+ $" AND s2.tag_definition_id != {1} AND l.project_id={projid}";
339341 var rdr = cmd.ExecuteReader();
340342 var result = new List<long>();
341343 while (rdr.Read())
@@ -373,13 +375,15 @@
373375 /// <param name="startPos"></param>
374376 /// <param name="endPos"></param>
375377 /// <returns></returns>
376- public static IList<Segment> RetrieveBunsetsu(ISession sess, Sentence sen)
378+ public static IList<Segment> RetrieveBunsetsu(ISession sess, Sentence sen, int projid)
377379 {
378380 long bunsetsuid = GetBunsetsuTagId(sess);
379381
380382 var q = sess.CreateQuery(
381- string.Format("from Segment seg where seg.Doc.ID = {0} and seg.Sentence.ID = {1} and seg.Tag.ID = {2} order by seg.StartChar",
382- sen.ParentDoc.ID, sen.ID, bunsetsuid));
383+ $"from Segment seg where seg.Doc.ID = {sen.ParentDoc.ID}" +
384+ $" and seg.Sentence.ID = {sen.ID}" +
385+ $" and seg.Tag.ID = {bunsetsuid}" +
386+ $" and seg.Proj.ID = {projid} order by seg.StartChar");
383387 return q.List<Segment>();
384388 }
385389
--- trunk/ChaKi.NET/src/Service/DependencyEdit/DepEditService.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/DependencyEdit/DepEditService.cs (revision 649)
@@ -386,7 +386,7 @@
386386 {
387387 throw new InvalidOperationException("Unknown sentence.");
388388 }
389- return Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen);
389+ return Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen, m_Context.Proj.ID);
390390 }
391391
392392 public TagSet GetTagSet()
@@ -1208,11 +1208,11 @@
12081208 {
12091209 throw new InvalidOperationException("Unknown sentence.");
12101210 }
1211- var groups = Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen);
1211+ var groups = Util.RetrieveWordGroups(m_Context.Session, m_Context.Sen, m_Context.Proj.ID);
12121212 var mwegroups = from g in groups where g.Tag.Name == "MWE" select g;
12131213
12141214 // この文のBunsetsu Segmentをリスト
1215- var buns = Util.RetrieveBunsetsu(m_Context.Session, m_Context.Sen);
1215+ var buns = Util.RetrieveBunsetsu(m_Context.Session, m_Context.Sen, m_Context.Proj.ID);
12161216 // この文のWordをリスト
12171217 var words = m_Context.Sen.GetWords(m_Context.Proj.ID);
12181218 if (buns.Count() != words.Count() + 1)
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceAnnotation.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceAnnotation.cs (revision 649)
@@ -123,7 +123,7 @@
123123 private bool WriteBunsetsuAndAnnotations(Sentence sen)
124124 {
125125 var hasAnnotation = false;
126- var grps = Util.RetrieveWordGroups(m_Session, sen);
126+ var grps = Util.RetrieveWordGroups(m_Session, sen, m_ProjId);
127127
128128 var senOffset = sen.StartChar;
129129 var segno = 0; // 本メソッドローカルのSegment番号(文内)
@@ -183,7 +183,7 @@
183183 }
184184
185185 // Groupの一部およびCabochaで表現される以外のSegmentを出力する
186- var segids = Util.RetrieveMiscSegments(m_Session, sen); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment
186+ var segids = Util.RetrieveMiscSegments(m_Session, sen, m_ProjId); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment
187187 foreach (var segid in segids)
188188 {
189189 if (seg_index.ContainsKey(segid)) // 既にGroupの一部としてExportされたSegmentは除外する.
@@ -209,7 +209,7 @@
209209 }
210210
211211 // Cabochaで表現される以外のLinkを出力する
212- IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen); // fromsegid, tosegid, tagname, comment
212+ IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen, m_ProjId); // fromsegid, tosegid, tagname, comment
213213 foreach (var linkid in linkids)
214214 {
215215 try
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceBase.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceBase.cs (revision 649)
@@ -25,6 +25,8 @@
2525 protected HashSet<Namespace> m_Namespaces = new HashSet<Namespace>();
2626 protected ReaderDef m_Def;
2727
28+ protected int m_ProjId = 0;
29+
2830 public void Dispose()
2931 {
3032 Dispose(true);
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceCabocha.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceCabocha.cs (revision 649)
@@ -26,6 +26,7 @@
2626 protected int m_CurrentDocId;
2727 protected Action<Lexeme> m_LexemeWriter;
2828 protected Dictionary<Document, int> m_DocOffsetCache;
29+ protected bool m_OutputSentencetagids = true;
2930
3031 public ExportServiceCabocha(TextWriter wr)
3132 {
@@ -117,10 +118,7 @@
117118 foreach (var a in sen.Attributes)
118119 {
119120 string csa = string.Format("{0}:{1}", ki.Crps.Name, a.ID);
120- if (m_SentenceTags.TryGetValue(csa, out pair))
121- {
122- m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid);
123- }
121+ WriteSentenceTag(csa);
124122 }
125123
126124 WriteWords(sen.GetWords(0));
@@ -128,6 +126,14 @@
128126 WriteEos();
129127 }
130128
129+ protected virtual void WriteSentenceTag(string csa)
130+ {
131+ if (m_SentenceTags.TryGetValue(csa, out var pair))
132+ {
133+ m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid);
134+ }
135+ }
136+
131137 static public long t1 = 0;
132138 static public long t2 = 0;
133139 static public long t3 = 0;
@@ -141,6 +147,8 @@
141147 {
142148 if (m_TextWriter == null) throw new InvalidOperationException("TextWriter is null.");
143149
150+ m_ProjId = project_id;
151+
144152 var t0 = DateTime.Now.Ticks;
145153 #if false // NHibernateでアクセスするとn^2オーダーの時間がかかる.
146154 var segs = m_Session.CreateSQLQuery(
@@ -161,8 +169,7 @@
161169 m_Segs = new Dictionary<long, int>();
162170 using (var cmd = m_Session.Connection.CreateCommand())
163171 {
164- cmd.CommandText = string.Format("SELECT id FROM segment WHERE sentence_id={0} and tag_definition_id={1} ORDER BY start_char",
165- sen.ID, tagid);
172+ cmd.CommandText = $"SELECT id FROM segment WHERE sentence_id={sen.ID} and tag_definition_id={tagid} and project_id={m_ProjId} ORDER BY start_char";
166173 var rdr = cmd.ExecuteReader();
167174 long last = -1;
168175 int i = 0;
@@ -207,7 +214,7 @@
207214 var tosegid = (long)rdr[2];
208215 var linktagname = (string)rdr[3];
209216 var score = 0.0;
210- Double.TryParse((rdr[4] as string)??string.Empty, out score);
217+ Double.TryParse((rdr[4] as string) ?? string.Empty, out score);
211218 m_Links.Add(fromsegid, new object[] { linkid, tosegid, linktagname, score });
212219 }
213220 rdr.Close();
@@ -232,10 +239,7 @@
232239 foreach (var a in sen.Attributes)
233240 {
234241 string csa = string.Format("{0}:{1}", crps.Name, a.ID);
235- if (m_SentenceTags.TryGetValue(csa, out pair))
236- {
237- m_TextWriter.WriteLine("#! SENTENCETAG {0}", pair.Seqid);
238- }
242+ WriteSentenceTag(csa);
239243 }
240244
241245 t3 += (DateTime.Now.Ticks - t0);
@@ -252,7 +256,7 @@
252256 cmd.CommandText =
253257 "SELECT w.lexeme_id, w.bunsetsu_segment_id, w.head_info, w.extra_chars" +
254258 " FROM word w" +
255- $" WHERE w.sentence_id={sen.ID} AND w.project_id={project_id} ORDER BY w.position ASC";
259+ $" WHERE w.sentence_id={sen.ID} AND w.project_id={m_ProjId} ORDER BY w.position ASC";
256260 var rdr = cmd.ExecuteReader();
257261 while (rdr.Read())
258262 {
@@ -368,9 +372,12 @@
368372 {
369373 if (m_TextWriter == null) throw new InvalidOperationException("TextWriter is null.");
370374
371- foreach (KeyValuePair<string, SeqIDTagPair> pair in m_SentenceTags)
375+ if (m_OutputSentencetagids)
372376 {
373- m_TextWriter.Write(pair.Value.Tag);
377+ foreach (KeyValuePair<string, SeqIDTagPair> pair in m_SentenceTags)
378+ {
379+ m_TextWriter.Write(pair.Value.Tag);
380+ }
374381 }
375382 }
376383
@@ -413,7 +420,7 @@
413420 protected bool WriteAnnotations(Sentence sen)
414421 {
415422 bool hasAnnotation = false;
416- IList<Group> grps = Util.RetrieveWordGroups(m_Session, sen);
423+ IList<Group> grps = Util.RetrieveWordGroups(m_Session, sen, m_ProjId);
417424
418425 int senOffset = sen.StartChar;
419426 int segno = 0; // 本メソッドローカルのSegment番号(文内)
@@ -438,7 +445,7 @@
438445 }
439446
440447 // Groupの一部およびCabochaで表現される以外のSegmentを出力する
441- IList<long> segids = Util.RetrieveMiscSegments(m_Session, sen); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment
448+ IList<long> segids = Util.RetrieveMiscSegments(m_Session, sen, m_ProjId); // Seg.ID, Seg.tagname, Seg.StartChar, Seg.EndChar, Seg.Comment
442449 foreach (var segid in segids)
443450 {
444451 if (seg_index.ContainsKey(segid)) // 既にGroupの一部としてExportされたSegmentは除外する.
@@ -464,7 +471,7 @@
464471 }
465472
466473 // Cabochaで表現される以外のLinkを出力する
467- IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen); // fromsegid, tosegid, tagname, comment
474+ IList<long> linkids = Util.RetrieveMiscLinks(m_Session, sen, m_ProjId); // fromsegid, tosegid, tagname, comment
468475 foreach (var linkid in linkids)
469476 {
470477 try
@@ -525,7 +532,10 @@
525532 {
526533 foreach (var attr in doc.Attributes)
527534 {
528- m_TextWriter.WriteLine("#! ATTR \"{0}\" \"{1}\" \"{2}\"", attr.Key, attr.Value, Util.EscapeQuote(attr.Comment));
535+ if (!attr.Key.StartsWith("@")) // "@"で始まるのはSentenceAttributeなのでDocumentAttributeとしては出力しない(SENTENCETAGIDとして出力される)
536+ {
537+ m_TextWriter.WriteLine("#! ATTR \"{0}\" \"{1}\" \"{2}\"", attr.Key, attr.Value, Util.EscapeQuote(attr.Comment));
538+ }
529539 }
530540 return;
531541 }
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceConll.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceConll.cs (revision 649)
@@ -19,11 +19,11 @@
1919 {
2020 }
2121
22- private long currentBunsetsuID;
23- private int bunsetsuPos;
24- private List<string> sb = new List<string>(); // Form
25- private List<string> sb2 = new List<string>(); // Base
26- private const string EMPTY_COL = "_";
22+ protected long currentBunsetsuID;
23+ protected int bunsetsuPos;
24+ protected List<string> sb = new List<string>(); // Form
25+ protected List<string> sb2 = new List<string>(); // Base
26+ protected const string EMPTY_COL = "_";
2727
2828 protected override void WriteWords(IList<Word> words)
2929 {
@@ -110,7 +110,7 @@
110110 m_TextWriter.WriteLine();
111111 }
112112
113- private int GetDependToIndex(long buns_id, out string dependAs)
113+ protected int GetDependToIndex(long buns_id, out string dependAs)
114114 {
115115 long toSegID = -1;
116116 object[] link = null;
@@ -129,7 +129,7 @@
129129 return index;
130130 }
131131
132- private string GetFeats(Word w)
132+ protected virtual string GetFeats(Word w)
133133 {
134134 var feats = new List<string>();
135135 if (w.Extras.Length > 0)
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceConllU.cs (nonexistent)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceConllU.cs (revision 649)
@@ -0,0 +1,189 @@
1+using ChaKi.Entity.Corpora;
2+using ChaKi.Entity.Corpora.Annotations;
3+using System;
4+using System.Collections.Generic;
5+using System.IO;
6+using System.Linq;
7+using System.Text;
8+using System.Xml;
9+
10+namespace ChaKi.Service.Export
11+{
12+ public class ExportServiceConllU : ExportServiceConll
13+ {
14+ public ExportServiceConllU(TextWriter wr)
15+ : base(wr)
16+ {
17+ // SENTENCETAGは出力しない. 代わりに各Sentenceの冒頭でコメント行を出力.
18+ m_OutputSentencetagids = false;
19+ }
20+
21+ protected override void WriteWords(IList<Word> words)
22+ {
23+ // ここで渡されるwordsには、Bunsetsu.ID, HeadInfo, Lex, Extras しかセットされていないtransient objectなので注意.
24+ // ExportServiceCabocha.ExportItem()を参照.
25+
26+ currentBunsetsuID = -1;
27+ bunsetsuPos = 0;
28+ sb.Clear();
29+ sb2.Clear();
30+
31+ var t00 = DateTime.Now.Ticks;
32+
33+ var wordsarray = words.Cast<Word>().ToArray();
34+
35+ t5 += (DateTime.Now.Ticks - t00);
36+ t00 = DateTime.Now.Ticks;
37+
38+ int n = wordsarray.Length;
39+ for (int i = 0; i < n; i++)
40+ {
41+ var t0 = DateTime.Now.Ticks;
42+
43+ Word w = wordsarray[i];
44+
45+ t1 += (DateTime.Now.Ticks - t0);
46+ t0 = DateTime.Now.Ticks;
47+
48+ var lex = w.Lex;
49+ if (lex == null)
50+ {
51+ continue;
52+ }
53+ sb.Add(lex.Surface);
54+ sb.Add(w.Extras);
55+ sb2.Add(lex.BaseLexeme.Surface);
56+ sb2.Add(w.Extras);
57+
58+ var pos = lex.PartOfSpeech;
59+ t2 += (DateTime.Now.Ticks - t0);
60+ t0 = DateTime.Now.Ticks;
61+
62+ var buns = w.Bunsetsu.ID;
63+ if (currentBunsetsuID != buns || i == n - 1)
64+ {
65+ // Output word + dependency
66+ // 自身
67+ int buns_pos = -1;
68+ m_Segs.TryGetValue(buns, out buns_pos);
69+ // 係り先
70+ string depAs;
71+ var depTo = GetDependToIndex(buns, out depAs);
72+
73+ if (sb.Count > 0)
74+ {
75+ sb.RemoveAt(sb.Count - 1); // 最後のExtraは出力しない.
76+ }
77+ if (sb2.Count > 0)
78+ {
79+ sb2.RemoveAt(sb2.Count - 1);
80+ }
81+ m_TextWriter.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}",
82+ buns_pos + 1, // ID
83+ string.Join("", sb.ToArray()), // FORM
84+ string.Join("", sb2.ToArray()), // LEMMA
85+ pos.Name1, // UPOS
86+ GetXPOS(pos), // XPOS
87+ GetFeats(w), // FEATS
88+ depTo + 1, // HEAD
89+ depAs, // DEPREL
90+ EMPTY_COL, // DEPS
91+ GetMisc(buns, lex)); // MISC
92+ sb.Clear();
93+ sb2.Clear();
94+ currentBunsetsuID = buns;
95+ bunsetsuPos++;
96+ }
97+ }
98+ t6 += (DateTime.Now.Ticks - t00);
99+ }
100+
101+ private string GetXPOS(PartOfSpeech pos)
102+ {
103+ // pos.Nameの'-'で区切られた最初の要素を除く部分を返す.
104+ var p = pos.Name.IndexOf('-');
105+ if (p >= 0 && p < (pos.Name.Length - 1))
106+ {
107+ return pos.Name.Substring(p + 1);
108+ }
109+ return string.Empty;
110+ }
111+
112+ protected override string GetFeats(Word w)
113+ {
114+ var feats = new List<string>();
115+ if (w.Extras.Length > 0)
116+ {
117+ feats.Add("SP");
118+ }
119+ // 今のところ、Word Featureをこれ以外に保持していない。Segment Attributeをここに入れるか?
120+
121+ var result = string.Join("|", feats.ToArray());
122+ if (result.Length > 0)
123+ {
124+ return result;
125+ }
126+ return EMPTY_COL;
127+ }
128+
129+ // Segment AttributeとLexeme.CustomPropertyからCONLLU MISCフィールドを生成
130+ private string GetMisc(long segid, Lexeme lex)
131+ {
132+ var fields = new List<string>();
133+
134+ // Segment Attributes
135+ using (var cmd = m_Session.Connection.CreateCommand())
136+ {
137+ cmd.CommandText =
138+ "SELECT attribute_key,attribute_value" +
139+ " FROM segment_attribute" +
140+ $" WHERE segment_id={segid} AND project_id={m_ProjId} ORDER BY id ASC";
141+ var rdr = cmd.ExecuteReader();
142+ while (rdr.Read())
143+ {
144+ var key = rdr.GetString(0);
145+ var val = rdr.GetString(1);
146+ fields.Add($"{key}={val}");
147+ }
148+ }
149+
150+
151+ // Lexeme.CustomProperty
152+ var cust = lex.CustomProperty;
153+ foreach (var prop in cust.Split('\n'))
154+ {
155+ var pair = prop.Split('\t');
156+ if (pair.Length == 2)
157+ {
158+ fields.Add($"{pair[0]}={pair[1]}");
159+ }
160+ }
161+
162+ if (fields.Count > 0)
163+ {
164+ return string.Join("|", fields);
165+ }
166+ return EMPTY_COL;
167+ }
168+
169+ protected override void WriteSentenceTag(string csa)
170+ {
171+ m_SentenceTags.TryGetValue(csa, out var seqtagpair);
172+ var t = seqtagpair.Tag.Split('\t');
173+ if (t.Length < 3) return;
174+ var s = $"<Root>{t[2]}</Root>";
175+ using (TextReader trdr = new StringReader(s))
176+ {
177+ XmlReader xrdr = XmlReader.Create(trdr);
178+ while (xrdr.Read())
179+ {
180+ if (xrdr.Name.Equals("Root")) continue;
181+ var key = xrdr.Name;
182+ var val = xrdr.ReadString();
183+ if (key.Length == 0) continue;
184+ m_TextWriter.WriteLine($"# {key} = {val}");
185+ }
186+ }
187+ }
188+ }
189+}
--- trunk/ChaKi.NET/src/Service/Export/ExportServiceMweToConll.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Export/ExportServiceMweToConll.cs (revision 649)
@@ -61,7 +61,7 @@
6161
6262 private void ExportItem(Corpus crps, Sentence sen, int project_id = 0)
6363 {
64- var groups = Util.RetrieveMWEGroups(m_Session, sen);
64+ var groups = Util.RetrieveMWEGroups(m_Session, sen, project_id);
6565
6666 if (groups.Count > 0)
6767 {
--- trunk/ChaKi.NET/src/Service/Readers/CabochaBunsetsu.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Readers/CabochaBunsetsu.cs (revision 649)
@@ -24,6 +24,8 @@
2424 public double Score { get; set; }
2525 public int HeadInd { get; set; } // 自立語主辞の文節内位置
2626 public int HeadAnc { get; set; } // 付属語(Ancillary word)主辞の文節内位置
27+ public string Comment { get; set; }
28+ public Dictionary<string, string> Attrs { get; set; } = new Dictionary<string, string>();
2729
2830 public CabochaBunsetsu(Sentence sen, Document doc, int startPos, int bunsetsuPos, string dependsAs, int dependsTo, double score)
2931 : this(sen, doc, startPos, startPos, bunsetsuPos, dependsAs, dependsTo, score)
--- trunk/ChaKi.NET/src/Service/Readers/ConllReader.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Readers/ConllReader.cs (revision 649)
@@ -112,25 +112,25 @@
112112 }
113113
114114 // 現在のChar Position (Document毎のPos)
115- private int m_CurCharPos;
115+ private protected int m_CurCharPos;
116116 // 現在のSentence番号(通しIDおよびDocument毎のPos)
117- private int m_CurSenID;
117+ private protected int m_CurSenID;
118118 // 現在のDocument
119- private Document m_CurDoc;
119+ private protected Document m_CurDoc;
120120 // 現在の文
121- private Sentence m_CurSen;
121+ private protected Sentence m_CurSen;
122122 // 文節データの一時リスト
123- private CabochaBunsetsuList m_BunsetsuList;
123+ private protected CabochaBunsetsuList m_BunsetsuList;
124124 // 現在(最後に読み込んだ)文節
125- private CabochaBunsetsu m_CurBunsetsu;
125+ private protected CabochaBunsetsu m_CurBunsetsu;
126126 // 現在の文内において、係り先が-1であるような文節のリスト
127- private List<CabochaBunsetsu> m_CurTerminalBunsetsu;
127+ private protected List<CabochaBunsetsu> m_CurTerminalBunsetsu;
128128 // Document全体の平文内容
129- private StringBuilder m_DocumentTextBuilder;
129+ private protected StringBuilder m_DocumentTextBuilder;
130130 // 今読んでいるSentenceに付属するBunsetsu以外のSegmentリスト
131- private List<Segment> m_CurSegments;
131+ private protected List<Segment> m_CurSegments;
132132 // 現在の複合語Chunk
133- private CompositeWordChunk m_CompositeWordChunk;
133+ private protected CompositeWordChunk m_CompositeWordChunk;
134134
135135 /// <summary>
136136 /// データをデータをストリームからDocumentに読み込む。
@@ -299,7 +299,7 @@
299299 }
300300 else if (s.StartsWith("#"))
301301 {
302- // Ignore
302+ ProcessCommentLine(s);
303303 }
304304 else if (s.Trim().Length > 0)
305305 {
@@ -380,8 +380,20 @@
380380 seg.Doc = buns.Doc;
381381 seg.Sentence = buns.Sen;
382382 seg.Version = TagSet.CurrentVersion;
383+ seg.Comment = buns.Comment;
383384 m_Corpus.AddSegment(seg);
384385 buns.Seg = seg;
386+ string uniDicLemma = null;
387+ foreach (var pair in buns.Attrs)
388+ {
389+ seg.Attributes.Add(new SegmentAttribute() {
390+ Proj = seg.Proj,
391+ Target = seg,
392+ User = seg.User,
393+ Version = seg.Version,
394+ Key = pair.Key,
395+ Value = pair.Value });
396+ }
385397 foreach (var w in buns.Words)
386398 {
387399 w.Bunsetsu = seg;
@@ -420,8 +432,13 @@
420432 Console.WriteLine("> {0} Links.", m_BunsetsuList.Count);
421433 }
422434
435+ protected virtual void ProcessCommentLine(string s)
436+ {
437+ // do nothing for Comment line (default).
438+ }
439+
423440 // 各単語に1文節を割り当て、係り受けを付与する
424- private void ProcessOneLine_1(string s, string[] fields)
441+ protected virtual void ProcessOneLine_1(string s, string[] fields)
425442 {
426443 var originalSurface = fields[1];
427444 Lexeme m = null;
@@ -498,7 +515,7 @@
498515
499516 // FEATSフィールドの IOB2 タグに応じて複数の行に1語を割り当て、さらにその1語に1文節と係り受けを付与する.
500517 // 複合語内の係り受けは破棄する.
501- private void ProcessOneLine_2(string s, string[] fields)
518+ protected virtual void ProcessOneLine_2(string s, string[] fields)
502519 {
503520 var feats = new string[0];
504521 if (fields.Length > 5 && fields[5] != "_")
--- trunk/ChaKi.NET/src/Service/Readers/ConllUReader.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Readers/ConllUReader.cs (revision 649)
@@ -9,98 +9,21 @@
99
1010 namespace ChaKi.Service.Readers
1111 {
12- public class ConllUReader : CorpusSourceReader
12+ public class ConllUReader : ConllReader
1313 {
14- private static TagSet DefaultTagSet;
15-
16- protected Corpus m_Corpus;
17-
18- static ConllUReader()
19- {
20- // CONLL用のデフォルトTagSet
21- DefaultTagSet = new TagSet("CabochaTagSet");
22- DefaultTagSet.AddTag(new Tag(Tag.SEGMENT, "Bunsetsu"));
23-
24- DefaultTagSet.AddVersion(new TagSetVersion("1", 0, true));
25-
26- CabochaReader.TagSet = null;
27- }
28-
2914 public ConllUReader(Corpus corpus, LexiconBuilder lb)
15+ : base(corpus, lb)
3016 {
31- throw new NotImplementedException("CONLLU format is not fully implemented yet.");
32- m_Corpus = corpus;
33- this.LexiconBuilder = lb;
34-
35- // TagSetの初期値はCabocha Defaultとし、インポート中に出現したTagを随時加える.
36- // 複数ファイルインポートの場合、ファイルを読み込むたびにここを通るが、
37- // そのたびごとにstaticなTagSetをリセットすると、Segment, Link等から参照しているTagが
38- // 次々と異なるものになってしまうので、既にTagSetに存在しているものを変更しないようにする.
39- if (CabochaReader.TagSet == null)
40- {
41- CabochaReader.TagSet = new TagSet();
42- }
43- CabochaReader.TagSet.MergeWith(DefaultTagSet);
4417 }
4518
46- public LexiconBuilder LexiconBuilder { get; set; } // 原始Lexicon
47-
48- public string EncodingToUse { get; set; }
49-
50- public bool FromDictionary { get; set; } // 辞書ファイルを読み込む場合はTrue; テキストを読み込む場合はFalse
51- private static TagSet TagSet => CabochaReader.TagSet;
52-
53- /// <summary>
54- /// CONLLデータをCorpusに読み込む(係り受け情報はSegment,Linkテーブルに入れる)
55- /// </summary>
56- /// <param name="path"></param>
57- /// <param name="encoding"></param>
58- public Document ReadFromFileSLA(string path, string encoding)
59- {
60- var newdoc = new Document();
61- newdoc.FileName = path;
62- using (var streamReader = new StreamReader(path, Encoding.GetEncoding(encoding)))
63- {
64- ReadFromStreamSLA(streamReader, -1, newdoc);
65- }
66- return newdoc;
67- }
68-
69- [Obsolete]
70- public Document ReadFromFile(string path, string encoding)
71- {
72- throw new System.NotImplementedException();
73- }
74-
75- // 現在のDocument
76- private Document m_CurDoc;
77- // 現在の文
78- private Sentence m_CurSen;
79- // 現在のSentence番号(通しIDおよびDocument毎のPos)
80- private int m_CurSenID;
81- // 現在のChar Position (Document毎のPos)
82- private int m_CurCharPos;
83- // 文節データの一時リスト
84- private CabochaBunsetsuList m_BunsetsuList;
85- // 現在(最後に読み込んだ)文節
86- private CabochaBunsetsu m_CurBunsetsu;
87- // 現在の文内において、係り先が-1であるような文節のリスト
88- private List<CabochaBunsetsu> m_CurTerminalBunsetsu;
89- // 文のDocument単位の出現リスト
90- private List<Sentence> m_SentencesInDoc;
91- // 現在の複合語Chunk
92- private CompositeWordChunk m_CompositeWordChunk;
93- // Document全体の平文内容
94- private StringBuilder m_DocumentTextBuilder;
95-
9619 // 各単語に1文節を割り当て、係り受けを付与する
97- private void ProcessOneLine_1(string s, string[] fields)
20+ protected override void ProcessOneLine_1(string s, string[] fields)
9821 {
9922 var originalSurface = fields[1];
10023 Lexeme m = null;
10124 try
10225 {
103- m = this.LexiconBuilder.AddEntryConll(s, this.FromDictionary, false);
26+ m = this.LexiconBuilder.AddEntryConllU(s, this.FromDictionary, false);
10427 }
10528 catch (Exception)
10629 {
@@ -119,6 +42,27 @@
11942 }
12043 var buns = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, f0, f7, f6, 0.0);
12144 buns.EndPos = buns.StartPos + originalSurface.Length;
45+ if (fields.Length > 9 && fields[9] != "_")
46+ {
47+ foreach (var pairs in fields[9].Split('|'))
48+ {
49+ var pair = pairs.Split('=');
50+ if (pair.Length == 2 && !buns.Attrs.ContainsKey(pair[0]))
51+ {
52+ if (pair[0] == "UniDicLemma")
53+ {
54+ // UniDicLemma属性はlexemeに持たせる.
55+ m.CustomProperty = $"{pair[0]}\t{pair[1]}";
56+ }
57+ else
58+ {
59+ // それ以外はBunsetsu SegmentのAttributeに持たせる.
60+ buns.Attrs.Add(pair[0], pair[1]);
61+ }
62+ }
63+
64+ }
65+ }
12266 m_CurBunsetsu = buns;
12367 if (buns.DependsTo == -1)
12468 {
@@ -130,35 +74,16 @@
13074 {
13175 Console.WriteLine(string.Format("Bunsetsu parse error: {0}", s));
13276 }
133- Word w = null;
134- var feats = new string[0];
135- if (fields.Length > 5 && fields[5] != "_")
77+ // SentenceとBunsetsuにWordを追加.
78+ var w = m_CurSen.AddWord(m);
79+ w.StartChar = m_CurCharPos;
80+ w.EndChar = m_CurCharPos + originalSurface.Length; // Word.Lengthを使ってもよいが、空白を含む文字列長であることに注意.
81+ // Surfaceの末尾の空白をWordに記録
82+ if (!originalSurface.Equals(m.Surface))
13683 {
137- feats = fields[5].Split('|');
84+ w.Extras = GetDiff(originalSurface, m.Surface);
13885 }
139- if (feats.Contains("SP"))
140- {
141- // FEATURESで空白を指定している場合
142- // SentenceとBunsetsuにWordを追加.
143- w = m_CurSen.AddWord(m);
144- w.StartChar = m_CurCharPos;
145- w.EndChar = m_CurCharPos + w.CharLength + 1;
146- w.Extras = " ";
147- m_DocumentTextBuilder.Append(originalSurface + w.Extras);
148- }
149- else
150- {
151- // SentenceとBunsetsuにWordを追加.
152- w = m_CurSen.AddWord(m);
153- w.StartChar = m_CurCharPos;
154- w.EndChar = m_CurCharPos + originalSurface.Length; // Word.Lengthを使ってもよいが、空白を含む文字列長であることに注意.
155- // Surfaceの末尾の空白をWordに記録
156- //if (!originalSurface.Equals(m.Surface))
157- //{
158- // w.Extras = GetDiff(originalSurface, m.Surface);
159- //}
160- m_DocumentTextBuilder.Append(originalSurface);
161- }
86+ m_DocumentTextBuilder.Append(originalSurface);
16287
16388 // 文節にこの語を割り当てる
16489 if (m_CurBunsetsu != null)
@@ -169,260 +94,31 @@
16994 }
17095 }
17196
172- public void ReadFromStreamSLA(TextReader rdr, int sentenceCount, Document doc)
97+ // CONLLUではFEATSフィールドの IOB2 タグは見ない.
98+ protected override void ProcessOneLine_2(string s, string[] fields)
17399 {
174- this.FromDictionary = false;
175-
176- string s;
177-
178- m_CurDoc = doc;
179- m_BunsetsuList = new CabochaBunsetsuList();
180- m_CurCharPos = 0;
181- m_CurSenID = 0;
182- m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = 0 };
183- m_CurBunsetsu = null; // 最後に読んだ文節
184- m_CurTerminalBunsetsu = new List<CabochaBunsetsu>(); // 現在の文内において、係り先が-1であるような文節
185- //m_CurSegments = new List<Segment>(); // 今読んでいるSentenceに付属するBunsetsu以外のSegmentリスト
186- m_DocumentTextBuilder = new StringBuilder();
187- object lastAnnotationTag = m_CurDoc; // 現在の文内において、最後に読んだDocumentまたはSegment(文節以外)またはLinkまたはGroup
188- m_SentencesInDoc = new List<Sentence>();
189- //m_SegmentsInDoc = new List<Segment>();
190- m_CompositeWordChunk = new CompositeWordChunk();
191-
192- var currentComposite = string.Empty;
193-
194- int n = 0;
195- while (true)
196- {
197- s = rdr.ReadLine();
198- if (s == null)
199- {
200- if (m_CurSen.Words.Count > 0) // 正常なファイル末尾なら、m_CurSenはクリアされているはず。
201- {
202- s = string.Empty;// ファイル末尾にEOS行(空行)がないので、空行の存在をシミュレートする.
203- }
204- else
205- {
206- break;
207- }
208- }
209- s = Cleanup(s); // ファイル途中のBOM(catした時に残っている場合がある)を削除する
210-
211- if (s.StartsWith("#"))
212- {
213- // Ignore
214- }
215- else if (s.Trim().Length > 0)
216- {
217- // 語を表す1行を処理する.
218- var fields = s.Split('\t');
219- try
220- {
221- ProcessOneLine_2(s, fields);
222- }
223- catch (Exception ex)
224- {
225- Console.WriteLine("At line {0}: Error: {1}", n, ex.Message);
226- }
227- }
228- else
229- {
230- // CONLL: 空行は文の終わり
231- if (m_CurBunsetsu == null)
232- { // デフォルト文節を追加(入力がChasen/Mecabの場合のため)
233- var buns = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurSen.StartChar, 0, String.Empty, -1, 0.0);
234- buns.EndPos = m_CurCharPos;
235- m_BunsetsuList.Add(buns);
236- m_CurBunsetsu = buns;
237- m_CurTerminalBunsetsu.Add(buns);
238- }
239- // 終端ダミー文節を追加
240- var dummy = new CabochaBunsetsu(m_CurSen, m_CurDoc, m_CurCharPos, m_CurBunsetsu.BunsetsuPos + 1, String.Empty, -1, 0.0);
241- m_BunsetsuList.Add(dummy);
242- // 係り先が-1である文節をdummyに係るようにする。
243- if (m_CurTerminalBunsetsu != null && m_CurTerminalBunsetsu.Count > 0)
244- {
245- foreach (var buns in m_CurTerminalBunsetsu)
246- {
247- buns.DependsTo = dummy.BunsetsuPos;
248- }
249- }
250-
251- if (++n % 1000 == 0)
252- {
253- Console.Write("> {0} Sentences.\r", n);
254- }
255- m_CurSen.EndChar = m_CurCharPos;
256-
257- m_SentencesInDoc.Add(m_CurSen);
258- m_Corpus.AddSentence(m_CurSen);
259- if (sentenceCount > 0 && n >= sentenceCount)
260- {
261- break;
262- }
263- // 以降のWordのために、新しいSentenceを初期化して用意する。
264- int lastsenpos = m_CurSen.Pos + 1;
265- m_CurSen = new Sentence(m_CurDoc) { ID = m_CurSenID++, Pos = lastsenpos };
266- m_CurSen.StartChar = m_CurCharPos;
267- m_CurBunsetsu = null;
268- m_CurTerminalBunsetsu.Clear();
269- //m_CurSegments = new List<Segment>();
270- m_CompositeWordChunk.Clear();
271- lastAnnotationTag = null;
272- }
273- }
274- m_CurDoc.Text = m_DocumentTextBuilder.ToString();
275- Console.Write("> {0} Sentences.\r", n);
276-
277- Console.WriteLine();
278- // BunsetsuをSegment&LinkとしてCorpusに登録
279- var bunsetsuTag = TagSet.FindTag(Tag.SEGMENT, "Bunsetsu");
280- n = 0;
281- foreach (var buns in m_BunsetsuList.Values)
282- {
283- if (++n % 100 == 0)
284- {
285- Console.Write("> {0} Segments.\r", n);
286- }
287- var seg = new Segment();
288- seg.StartChar = buns.StartPos;
289- seg.EndChar = buns.EndPos;
290- seg.Tag = bunsetsuTag;
291- seg.Doc = buns.Doc;
292- seg.Sentence = buns.Sen;
293- seg.Version = TagSet.CurrentVersion;
294- m_Corpus.AddSegment(seg);
295- buns.Seg = seg;
296- foreach (var w in buns.Words)
297- {
298- w.Bunsetsu = seg;
299- }
300- }
301- Console.WriteLine("> {0} Segments.", m_BunsetsuList.Count);
302- n = 0;
303- foreach (var buns in m_BunsetsuList.Values)
304- {
305- if (++n % 100 == 0)
306- {
307- Console.Write("> {0} Links.\r", n);
308- }
309- var depBunsetsu = m_BunsetsuList.Find(buns.Sen, buns.DependsTo);
310- if (depBunsetsu != null)
311- {
312- var link = new Link();
313- link.From = buns.Seg;
314- link.To = depBunsetsu.Seg;
315- link.FromSentence = buns.Sen;
316- link.ToSentence = buns.Sen;
317- link.Tag = TagSet.FindOrAddTag(Tag.LINK, buns.DependsAs);
318- link.Version = TagSet.CurrentVersion;
319- link.Attributes.Add(new LinkAttribute()
320- {
321- Proj = link.Proj,
322- Target = link,
323- User = link.User,
324- Version = link.Version,
325- Key = "Score",
326- Value = buns.Score.ToString()
327- });
328- m_Corpus.AddLink(link);
329- }
330- }
331- Console.WriteLine("> {0} Links.", m_BunsetsuList.Count);
100+ ProcessOneLine_1(s, fields);
332101 }
333102
334- // FEATSフィールドの IOB2 タグに応じて複数の行に1語を割り当て、さらにその1語に1文節と係り受けを付与する.
335- // 複合語内の係り受けは破棄する.
336- private void ProcessOneLine_2(string s, string[] fields)
103+ protected override void ProcessCommentLine(string s)
337104 {
338- var feats = new string[0];
339- if (fields.Length > 5 && fields[5] != "_")
105+ // コメント行にあるsent_idをSentence属性に格納
106+ if (s.StartsWith("# sent_id = "))
340107 {
341- feats = fields[5].Split('|');
342- }
343- var btag = feats.FirstOrDefault(f => f.StartsWith("B-")); // 複数のBタグがあった場合、最初のもののみ有効.
344- var itag = feats.FirstOrDefault(f => f.StartsWith("I-"));
345-
346- // 複合語の終了か
347- if (itag == null)
348- {
349- // 現在までに収集したCompositeWordChunkがあれば先に出力
350- if (!m_CompositeWordChunk.IsEmpty())
108+ // SentenceAttrはDocumentAttrへのリンクになる.
109+ var attrid = m_CurDoc.Attributes.Count;
110+ m_CurDoc.Attributes.Add(new DocumentAttribute()
351111 {
352- try
353- {
354- var scw = m_CompositeWordChunk.ToConllSingleLine();
355- var scwf = scw.Split('\t');
356- ProcessOneLine_1(scw, scwf);
357- }
358- finally
359- {
360- m_CompositeWordChunk.Clear();
361- }
362- }
363- }
364- if (btag != null) // BならChunkを初期化
365- {
366- m_CompositeWordChunk.Clear();
367- m_CompositeWordChunk.ChunkPOS = btag.Substring(2);
368- }
369- if (btag != null || itag != null) // BまたはI
370- {
371- // 現在の語をCompositeWordChunkに追加し、出力は行わない.
372- var f0 = Int32.Parse(fields[0]) - 1;
373- var f5 = (fields.Length > 5) ? fields[5] : "_";
374- var f6 = f0 + 1;
375- if (fields.Length > 6 && fields[6] != "_")
112+ ID = attrid,
113+ Key = "@sent_id",
114+ Value = s.Substring(13)
115+ });
116+ m_CurSen.Attributes.Add(new SentenceAttribute()
376117 {
377- f6 = Int32.Parse(fields[6]) - 1;
378- }
379- var f7 = (fields.Length > 7) ? fields[7] : string.Empty;
380- m_CompositeWordChunk.Add(f0, fields[1], fields[2], f5, f6, f7);
118+ ID = attrid
119+ });
381120 }
382- else // BでもIでもない
383- {
384- // 現在の行を出力
385- ProcessOneLine_1(s, fields);
386- }
387121 }
388122
389- public void ReadLexiconFromStream(TextReader rdr, bool baseOnly)
390- {
391- throw new System.NotImplementedException();
392- }
393-
394- public void ReadLexiconFromStream(TextReader rdr)
395- {
396- throw new System.NotImplementedException();
397- }
398-
399- public void SetFieldDefs(Field[] fieldDefs)
400- {
401- throw new System.NotImplementedException();
402- }
403-
404- /// <summary>
405- /// 行頭のBOMを除去する
406- /// </summary>
407- /// <param name="input"></param>
408- /// <returns></returns>
409- public static string Cleanup(string input)
410- {
411- var sb = new StringBuilder();
412- var firstChar = true;
413- foreach (var c in input)
414- {
415- if (c != 0xFEFF)
416- {
417- sb.Append(c);
418- if (firstChar)
419- {
420- return input;
421- }
422- }
423- firstChar = false;
424- }
425- return sb.ToString();
426- }
427123 }
428124 }
--- trunk/ChaKi.NET/src/Service/Readers/LexiconBuilder.cs (revision 648)
+++ trunk/ChaKi.NET/src/Service/Readers/LexiconBuilder.cs (revision 649)
@@ -639,5 +639,54 @@
639639
640640 return ret;
641641 }
642+
643+ public Lexeme AddEntryConllU(string s, bool fromDictionary, bool baseOnly)
644+ {
645+ string[] props = SplitConllUFormat(s);
646+ if (fromDictionary)
647+ {
648+ return AddEntrySimple(props, baseOnly);
649+ }
650+ else
651+ {
652+ return AddEntry(props);
653+ }
654+ }
655+
656+ public string[] SplitConllUFormat(string s)
657+ {
658+ var lex = new Lexeme();
659+
660+ var fields = s.Split('\t');
661+ if (fields.Length < 4)
662+ {
663+ throw new Exception(string.Format("Mismatch in field count. Required=4~10; Seen={0}", fields.Length));
664+ }
665+ var ret = new string[10] { string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty };
666+ // 0: ID in a sentence
667+ // 1: Surface Form
668+ ret[0] = fields[1].Trim(); // ここで末尾の空白を取る。インポート時に削除するのは原則末尾の空白のみ。
669+ // 2: Lemma Form
670+ if (!fields[2].Equals("_"))
671+ {
672+ ret[5] = fields[2];
673+ }
674+ // 3-4: POS
675+ if (!fields[3].Equals("_"))
676+ {
677+ ret[6] = fields[3];
678+ }
679+ if (fields.Length > 4 && !fields[4].Equals("_"))
680+ {
681+ ret[6] = string.Format("{0}-{1}", ret[6], fields[4]);
682+ }
683+ // 5: Features
684+ // 6: Head
685+ // 7: DepRel
686+ // 8: PHead
687+ // 9: PDepRel
688+
689+ return ret;
690+ }
642691 }
643692 }
--- trunk/ChaKi.NET/src/Text2Corpus/Properties/AssemblyInfo.cs (revision 648)
+++ trunk/ChaKi.NET/src/Text2Corpus/Properties/AssemblyInfo.cs (revision 649)
@@ -12,5 +12,5 @@
1212 [assembly: AssemblyCulture("")]
1313 [assembly: ComVisible(false)]
1414 [assembly: Guid("a8cf8403-eb88-418f-bf54-56aeaef39268")]
15-[assembly: AssemblyVersion("3.14.642.0")]
16-[assembly: AssemblyFileVersion("3.14.642.0")]
15+[assembly: AssemblyVersion("3.14.648.0")]
16+[assembly: AssemblyFileVersion("3.14.648.0")]
--- trunk/ChaKi.NET/src/Timings/Properties/AssemblyInfo.cs (revision 648)
+++ trunk/ChaKi.NET/src/Timings/Properties/AssemblyInfo.cs (revision 649)
@@ -12,5 +12,5 @@
1212 [assembly: AssemblyCulture("")]
1313 [assembly: ComVisible(false)]
1414 [assembly: Guid("ff6652ed-b932-466b-944b-ce88d698979b")]
15-[assembly: AssemblyVersion("3.14.642.0")]
16-[assembly: AssemblyFileVersion("3.14.642.0")]
15+[assembly: AssemblyVersion("3.14.648.0")]
16+[assembly: AssemblyFileVersion("3.14.648.0")]
Show on old repository browser