Browse Subversion Repository
Contents of /misc/data/datagen.c
Parent Directory
| Revision Log
Revision 114 -
( show annotations)
( download)
( as text)
Mon Apr 21 01:56:41 2008 UTC
(15 years, 11 months ago)
by mir
File MIME type: text/x-csrc
File size: 4885 byte(s)
Added random Japanese data generator.
| 1 |
/* |
| 2 |
Random Japanese data generator |
| 3 |
|
| 4 |
@author Tetsuro IKEDA <ikdttr@gmail.com> |
| 5 |
|
| 6 |
Copyright(c) 2008 Tritonn Project |
| 7 |
License LGPL v2 |
| 8 |
*/ |
| 9 |
|
| 10 |
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
| 14 |
|
| 15 |
/*
  number of parses

  Must equal the number of entries in dict_info[] and row_num[]; both
  tables list 26 dictionaries, so a smaller value silently drops the
  trailing ones from loading and from random selection.
*/
#define PARSE_NUM 26
| 17 |
|
| 18 |
/*
  Static description of one dictionary file.

  fname : name of the .csv file the dictionary is read from
  raito : relative weight of this dictionary when a word source is
          picked at random (summed and compared in dice_parse_id())
*/
struct st_dict_info {
  char *fname;
  int raito;
};
typedef struct st_dict_info DICT_INFO;
| 22 |
|
| 23 |
DICT_INFO dict_info[] = { |
| 24 |
{"Noun.others.csv", 10}, |
| 25 |
{"Noun.verbal.csv", 10}, |
| 26 |
{"Symbol.csv", 10}, |
| 27 |
{"Noun.demonst.csv", 10}, |
| 28 |
{"Interjection.csv", 10}, |
| 29 |
{"Prefix.csv", 10}, |
| 30 |
{"Noun.adjv.csv", 10}, |
| 31 |
{"Conjunction.csv", 80}, |
| 32 |
{"Noun.place.csv", 10}, |
| 33 |
{"Adj.csv", 30}, |
| 34 |
{"Noun.nai.csv", 10}, |
| 35 |
{"Noun.proper.csv", 10}, |
| 36 |
{"Postp-col.csv", 10}, |
| 37 |
{"Noun.csv", 80}, |
| 38 |
{"Noun.org.csv", 10}, |
| 39 |
{"Others.csv", 1}, |
| 40 |
{"Filler.csv", 10}, |
| 41 |
{"Adnominal.csv", 10}, |
| 42 |
{"Auxil.csv", 10}, |
| 43 |
{"Verb.csv", 60}, |
| 44 |
{"Adverb.csv", 30}, |
| 45 |
{"Noun.number.csv", 10}, |
| 46 |
{"Suffix.csv", 10}, |
| 47 |
{"Postp.csv", 10}, |
| 48 |
{"Noun.name.csv", 30}, |
| 49 |
{"Noun.adverbal.csv", 10} |
| 50 |
}; |
| 51 |
|
| 52 |
/*
  Number of rows to read from each dictionary file.
  Entry i belongs to dict_info[i]; the file name is noted beside each count.
*/
int row_num[] = {
  151,    /* Noun.others.csv   */
  12146,  /* Noun.verbal.csv   */
  208,    /* Symbol.csv        */
  120,    /* Noun.demonst.csv  */
  252,    /* Interjection.csv  */
  221,    /* Prefix.csv        */
  3328,   /* Noun.adjv.csv     */
  171,    /* Conjunction.csv   */
  72999,  /* Noun.place.csv    */
  27210,  /* Adj.csv           */
  42,     /* Noun.nai.csv      */
  27327,  /* Noun.proper.csv   */
  91,     /* Postp-col.csv     */
  60477,  /* Noun.csv          */
  16668,  /* Noun.org.csv      */
  2,      /* Others.csv        */
  19,     /* Filler.csv        */
  135,    /* Adnominal.csv     */
  199,    /* Auxil.csv         */
  130750, /* Verb.csv          */
  3032,   /* Adverb.csv        */
  42,     /* Noun.number.csv   */
  1393,   /* Suffix.csv        */
  146,    /* Postp.csv         */
  34202,  /* Noun.name.csv     */
  795     /* Noun.adverbal.csv */
};
| 80 |
|
| 81 |
/*
  One dictionary held in memory.

  name     : file name the rows are read from
  row_num  : number of rows
  row_size : per-row length in bytes (array of row_num ints)
  data     : row text, one fixed 128-byte slot per row
*/
struct st_parse {
  char *name;
  int row_num;
  int *row_size;
  char (*data)[128];
};
typedef struct st_parse PARSE;
| 87 |
|
| 88 |
/* Loaded dictionaries, one slot per parse; filled in by main(). */
PARSE dict[PARSE_NUM];
| 89 |
|
| 90 |
/*
  Dump every row of one parse to stdout, one line per row.

  PARSE *p : parse to print; row ids are shown 1-based.
*/
void print_parse(PARSE *p)
{
  int row = 0;

  while (row < p->row_num) {
    const int row_id = row + 1;

    printf("name=%s, row_num=%d, row_id=%d, row_size=%d, data=%s\n",
           p->name, p->row_num, row_id, p->row_size[row], p->data[row]);
    row++;
  }
}
| 98 |
|
| 99 |
/*
  Dump PARSE_NUM consecutive parses to stdout.

  PARSE *p : first element of an array holding PARSE_NUM parses
*/
void print_dict(PARSE *p)
{
  PARSE *const end = p + PARSE_NUM;

  for (; p != end; p++) {
    print_parse(p);
  }
}
| 106 |
/* |
| 107 |
read .csv file into parse |
| 108 |
|
| 109 |
PARSE *p : target parse to be loaded |
| 110 |
this must have "name" as file name. |
| 111 |
return : 0 success -1 failed |
| 112 |
*/ |
| 113 |
int load_parse(PARSE *p) |
| 114 |
{ |
| 115 |
if (!p || !(p->name)) { |
| 116 |
printf("load_parse: incorrect argument\n"); |
| 117 |
return -1; |
| 118 |
} |
| 119 |
FILE *f; |
| 120 |
if (f = fopen(p->name, "r")) { |
| 121 |
if (!(p->row_size = malloc((sizeof p->row_size) * p->row_num))) { |
| 122 |
printf("malloc error\n"); |
| 123 |
return -1; |
| 124 |
} |
| 125 |
int msize = p->row_num * 128; |
| 126 |
if (!(p->data = malloc(msize))) { |
| 127 |
printf("cannot malloc for file '%s', malloc size=%d\n",msize); |
| 128 |
return -1; |
| 129 |
} |
| 130 |
int line; |
| 131 |
char *res; |
| 132 |
for (line=0; line < p->row_num; line++) { |
| 133 |
if(!(res = fgets(p->data[line],128,f))) { |
| 134 |
printf("cannot read expected rows from %s, currently #%d\n",p->name,line+1); |
| 135 |
return -1; |
| 136 |
} |
| 137 |
int i; |
| 138 |
/* replace '\n' into '\0' */ |
| 139 |
for (i=0; i <= 128; i++) { |
| 140 |
if ((p->data[line][i] == '\n') && (p->data[line][i+1] == '\0')) { |
| 141 |
p->data[line][i] = '\0'; |
| 142 |
p->row_size[line] = i; |
| 143 |
break; |
| 144 |
} |
| 145 |
} |
| 146 |
} |
| 147 |
} else { |
| 148 |
printf("cannot open file '%s'\n",p->name); |
| 149 |
return -1; |
| 150 |
} |
| 151 |
fclose(f); |
| 152 |
return 0; |
| 153 |
} |
| 154 |
|
| 155 |
int raito_sum=0; |
| 156 |
int dice_parse_id() |
| 157 |
{ |
| 158 |
int i; |
| 159 |
struct timeval t; |
| 160 |
gettimeofday(&t,NULL); |
| 161 |
int seed = (int) t.tv_usec; |
| 162 |
/* cache raito_sum */ |
| 163 |
if (raito_sum == 0) { |
| 164 |
for (i=0; i < PARSE_NUM; i++) raito_sum+=dict_info[i].raito; |
| 165 |
} |
| 166 |
int rand_id = rand_r(&seed); |
| 167 |
rand_id = rand_id % raito_sum; |
| 168 |
int curr_id=0; |
| 169 |
for (i=0; i < PARSE_NUM; i++) { |
| 170 |
curr_id += dict_info[i].raito; |
| 171 |
if (curr_id > rand_id) { |
| 172 |
break; |
| 173 |
} else { |
| 174 |
|
| 175 |
} |
| 176 |
} |
| 177 |
return i; |
| 178 |
} |
| 179 |
|
| 180 |
int dice_row_id(int dict_id) |
| 181 |
{ |
| 182 |
struct timeval t; |
| 183 |
gettimeofday(&t,NULL); |
| 184 |
int seed = (int) t.tv_usec; |
| 185 |
int rand_id = rand_r(&seed); |
| 186 |
return rand_id % dict[dict_id].row_num; |
| 187 |
} |
| 188 |
|
| 189 |
int main(int argc, char *argv[]) |
| 190 |
{ |
| 191 |
if (argc != 3) { |
| 192 |
printf("usage: datagen2 [row_size] [row_num]\n"); |
| 193 |
return 0; |
| 194 |
} |
| 195 |
int gen_row_size=atoi(argv[1]); |
| 196 |
int gen_row_num=atoi(argv[2]); |
| 197 |
char *buf; |
| 198 |
|
| 199 |
int i; |
| 200 |
|
| 201 |
/* init and load dictionaries */ |
| 202 |
for (i=0; i < PARSE_NUM; i++) { |
| 203 |
dict[i].name = dict_info[i].fname; |
| 204 |
dict[i].row_num = row_num[i]; |
| 205 |
if (load_parse(&dict[i])) { |
| 206 |
printf("parse load error\n"); |
| 207 |
return -1; |
| 208 |
} |
| 209 |
} |
| 210 |
|
| 211 |
int offset; |
| 212 |
for (i=0; i < gen_row_num; i++) { |
| 213 |
buf = malloc(gen_row_size+1); |
| 214 |
offset = 0; |
| 215 |
while (1) { |
| 216 |
int dict_id = dice_parse_id(); |
| 217 |
int row_id = dice_row_id(dict_id); |
| 218 |
int word_size = dict[dict_id].row_size[row_id]; |
| 219 |
if (offset + word_size > gen_row_size) break; |
| 220 |
int j; |
| 221 |
for (j=0; j < word_size; j++) { |
| 222 |
buf[offset+j] = dict[dict_id].data[row_id][j]; |
| 223 |
} |
| 224 |
offset += word_size; |
| 225 |
} |
| 226 |
printf("%s\n",buf); |
| 227 |
free(buf); |
| 228 |
} |
| 229 |
return 0; |
| 230 |
} |
| 231 |
|
| |