Develop and Download Open Source Software

Browse Subversion Repository

Contents of /misc/data/datagen.c

Parent Directory | Revision Log


Revision 114 - (show annotations) (download) (as text)
Mon Apr 21 01:56:41 2008 UTC (15 years, 11 months ago) by mir
File MIME type: text/x-csrc
File size: 4885 byte(s)
Added random Japanese data generator.

/*
  Random Japanese data generator

  @author Tetsuro IKEDA <ikdttr@gmail.com>

  Copyright(c) 2008 Tritonn Project
  License LGPL v2
*/
9
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
14
/*
  Number of parses (dictionary files) that are loaded and sampled.
  Must equal the number of entries in dict_info[] and row_num[]; both
  tables hold 26 entries, so a smaller value silently drops the trailing
  dictionaries.
*/
#define PARSE_NUM 26
17
/* Static description of one dictionary file. */
typedef struct st_dict_info {
    char *fname;  /* file name handed to fopen() when the dictionary is loaded */
    int raito;    /* relative weight ("ratio") used when a dictionary is picked at random */
} DICT_INFO;
22
23 DICT_INFO dict_info[] = {
24 {"Noun.others.csv", 10},
25 {"Noun.verbal.csv", 10},
26 {"Symbol.csv", 10},
27 {"Noun.demonst.csv", 10},
28 {"Interjection.csv", 10},
29 {"Prefix.csv", 10},
30 {"Noun.adjv.csv", 10},
31 {"Conjunction.csv", 80},
32 {"Noun.place.csv", 10},
33 {"Adj.csv", 30},
34 {"Noun.nai.csv", 10},
35 {"Noun.proper.csv", 10},
36 {"Postp-col.csv", 10},
37 {"Noun.csv", 80},
38 {"Noun.org.csv", 10},
39 {"Others.csv", 1},
40 {"Filler.csv", 10},
41 {"Adnominal.csv", 10},
42 {"Auxil.csv", 10},
43 {"Verb.csv", 60},
44 {"Adverb.csv", 30},
45 {"Noun.number.csv", 10},
46 {"Suffix.csv", 10},
47 {"Postp.csv", 10},
48 {"Noun.name.csv", 30},
49 {"Noun.adverbal.csv", 10}
50 };
51
/*
  Number of rows to read from each dictionary file.
  Matched by index with dict_info[]; the file each count belongs to is
  noted next to it.
*/
int row_num[] = {
    151,     /* Noun.others.csv */
    12146,   /* Noun.verbal.csv */
    208,     /* Symbol.csv */
    120,     /* Noun.demonst.csv */
    252,     /* Interjection.csv */
    221,     /* Prefix.csv */
    3328,    /* Noun.adjv.csv */
    171,     /* Conjunction.csv */
    72999,   /* Noun.place.csv */
    27210,   /* Adj.csv */
    42,      /* Noun.nai.csv */
    27327,   /* Noun.proper.csv */
    91,      /* Postp-col.csv */
    60477,   /* Noun.csv */
    16668,   /* Noun.org.csv */
    2,       /* Others.csv */
    19,      /* Filler.csv */
    135,     /* Adnominal.csv */
    199,     /* Auxil.csv */
    130750,  /* Verb.csv */
    3032,    /* Adverb.csv */
    42,      /* Noun.number.csv */
    1393,    /* Suffix.csv */
    146,     /* Postp.csv */
    34202,   /* Noun.name.csv */
    795      /* Noun.adverbal.csv */
};
80
/* One dictionary file loaded into memory. */
typedef struct st_parse {
    char *name;         /* file name the rows are read from */
    int row_num;        /* number of rows to read / held in row_size and data */
    int *row_size;      /* row_size[i]: length in bytes of data[i], newline excluded */
    char (*data)[128];  /* data[i]: row i as a NUL-terminated string in a 128-byte slot */
} PARSE;
87
88 PARSE dict[PARSE_NUM];
89
90 void print_parse(PARSE *p)
91 {
92 int i;
93 for (i=0; i < p->row_num; i++) {
94 printf("name=%s, row_num=%d, row_id=%d, row_size=%d, data=%s\n",
95 p->name,p->row_num,i+1,p->row_size[i],p->data[i]);
96 }
97 }
98
99 void print_dict(PARSE *p)
100 {
101 int i;
102 for (i=0; i < PARSE_NUM; i++) {
103 print_parse(p++);
104 }
105 }
106 /*
107 read .csv file into parse
108
109 PARSE *p : target parse to be loaded
110 this must have "name" as file name.
111 return : 0 success -1 failed
112 */
113 int load_parse(PARSE *p)
114 {
115 if (!p || !(p->name)) {
116 printf("load_parse: incorrect argument\n");
117 return -1;
118 }
119 FILE *f;
120 if (f = fopen(p->name, "r")) {
121 if (!(p->row_size = malloc((sizeof p->row_size) * p->row_num))) {
122 printf("malloc error\n");
123 return -1;
124 }
125 int msize = p->row_num * 128;
126 if (!(p->data = malloc(msize))) {
127 printf("cannot malloc for file '%s', malloc size=%d\n",msize);
128 return -1;
129 }
130 int line;
131 char *res;
132 for (line=0; line < p->row_num; line++) {
133 if(!(res = fgets(p->data[line],128,f))) {
134 printf("cannot read expected rows from %s, currently #%d\n",p->name,line+1);
135 return -1;
136 }
137 int i;
138 /* replace '\n' into '\0' */
139 for (i=0; i <= 128; i++) {
140 if ((p->data[line][i] == '\n') && (p->data[line][i+1] == '\0')) {
141 p->data[line][i] = '\0';
142 p->row_size[line] = i;
143 break;
144 }
145 }
146 }
147 } else {
148 printf("cannot open file '%s'\n",p->name);
149 return -1;
150 }
151 fclose(f);
152 return 0;
153 }
154
155 int raito_sum=0;
156 int dice_parse_id()
157 {
158 int i;
159 struct timeval t;
160 gettimeofday(&t,NULL);
161 int seed = (int) t.tv_usec;
162 /* cache raito_sum */
163 if (raito_sum == 0) {
164 for (i=0; i < PARSE_NUM; i++) raito_sum+=dict_info[i].raito;
165 }
166 int rand_id = rand_r(&seed);
167 rand_id = rand_id % raito_sum;
168 int curr_id=0;
169 for (i=0; i < PARSE_NUM; i++) {
170 curr_id += dict_info[i].raito;
171 if (curr_id > rand_id) {
172 break;
173 } else {
174
175 }
176 }
177 return i;
178 }
179
180 int dice_row_id(int dict_id)
181 {
182 struct timeval t;
183 gettimeofday(&t,NULL);
184 int seed = (int) t.tv_usec;
185 int rand_id = rand_r(&seed);
186 return rand_id % dict[dict_id].row_num;
187 }
188
189 int main(int argc, char *argv[])
190 {
191 if (argc != 3) {
192 printf("usage: datagen2 [row_size] [row_num]\n");
193 return 0;
194 }
195 int gen_row_size=atoi(argv[1]);
196 int gen_row_num=atoi(argv[2]);
197 char *buf;
198
199 int i;
200
201 /* init and load dictionaries */
202 for (i=0; i < PARSE_NUM; i++) {
203 dict[i].name = dict_info[i].fname;
204 dict[i].row_num = row_num[i];
205 if (load_parse(&dict[i])) {
206 printf("parse load error\n");
207 return -1;
208 }
209 }
210
211 int offset;
212 for (i=0; i < gen_row_num; i++) {
213 buf = malloc(gen_row_size+1);
214 offset = 0;
215 while (1) {
216 int dict_id = dice_parse_id();
217 int row_id = dice_row_id(dict_id);
218 int word_size = dict[dict_id].row_size[row_id];
219 if (offset + word_size > gen_row_size) break;
220 int j;
221 for (j=0; j < word_size; j++) {
222 buf[offset+j] = dict[dict_id].data[row_id][j];
223 }
224 offset += word_size;
225 }
226 printf("%s\n",buf);
227 free(buf);
228 }
229 return 0;
230 }
231

Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26