• R/O
  • SSH
  • HTTPS

tsukurimashou: Commit


Commit MetaInfo

Revision: 492 (tree)
Time: 2013-12-10 04:24:42
Author: mskala

Log Message

word wrap feature, and several shaven yaks

Change Summary

Incremental Difference

--- trunk/idsgrep/widths.txt (nonexistent)
+++ trunk/idsgrep/widths.txt (revision 492)
@@ -0,0 +1,5 @@
1+0000..001F;0 # controls
2+1160..11FF;0 # hangul vowels and finals
3+2460..257F;W # enclosed letters, digits, box drawing
4+2776..2793;W # circled dingbat letters and numbers
5+3248..324F;W # more circled numbers
--- trunk/idsgrep/idsgrep.h (revision 491)
+++ trunk/idsgrep/idsgrep.h (revision 492)
@@ -201,7 +201,7 @@
201201
202202 /* cook.c */
203203
204-extern int cook_output,canonicalize_input;
204+extern int cook_output,colourize_output,canonicalize_input;
205205
206206 void set_output_recipe(char *);
207207 void write_bracketed_string(HASHED_STRING *,HASHED_STRING *,FILE *f);
--- trunk/idsgrep/configure.ac (revision 491)
+++ trunk/idsgrep/configure.ac (revision 492)
@@ -273,6 +273,13 @@
273273 [enable_gcov=no])
274274 AM_CONDITIONAL([COND_GCOV],[test '!' "$enable_gcov" = no])
275275 #
276+AC_ARG_ENABLE([widthtab],
277+ [AS_HELP_STRING([--enable-widthtab],
278+ [reconstruct the character width automaton table])],
279+ [],
280+ [enable_widthtab=no])
281+AM_CONDITIONAL([COND_WIDTHTAB],[test '!' "$enable_widthtab" = no])
282+#
276283 AC_ARG_WITH([chise-ids],
277284 [AS_HELP_STRING([--with-chise-ids=PATH],
278285 [CHISE IDS database [auto]])],
--- trunk/idsgrep/Makefile.am (revision 491)
+++ trunk/idsgrep/Makefile.am (revision 492)
@@ -93,7 +93,11 @@
9393
9494 if COND_BUDDY
9595 BUDDY_LIBS=-lbdd
96+else
97+if COND_WIDTHTAB
98+$(error Cannot use --enable-widthtab without BuDDy)
9699 endif
100+endif
97101 if COND_CHISE_IDS
98102 MAYBE_CIDATA=chise.eids chise.bvec
99103 endif
@@ -180,6 +184,7 @@
180184 $(GCOV_TESTS) test/vgneko test/rmgcda test/gcov
181185
182186 bin_PROGRAMS = idsgrep
187+noinst_PROGRAMS = mkwcw
183188
184189 dict_DATA = $(MAYBE_CJKVIDATA) $(MAYBE_CIDATA) $(MAYBE_EDICTDATA) \
185190 $(MAYBE_KVDATA) $(MAYBE_TSUKUDATA)
@@ -189,8 +194,10 @@
189194 AM_CFLAGS := $(MAYBE_COVERAGE) $(PCRE_CFLAGS) $(AM_CFLAGS)
190195 idsgrep_SOURCES = \
191196 assoc.c bitvec.c cook.c hash.c idsgrep.c idsgrep.h match.c \
192- parse.c regex.c unilist.c userpred.c
197+ parse.c regex.c unilist.c userpred.c widthtab.c
193198
199+mkwcw_SOURCES = mkwcw.c
200+
194201 LDADD = @LIBOBJS@ $(PCRE_LIBS) $(BUDDY_LIBS)
195202
196203 man1_MANS = idsgrep.1
@@ -268,6 +275,16 @@
268275
269276 ############################################################################
270277
278+# BUILD THE WIDTH TABLE
279+
280+if COND_WIDTHTAB
281+widthtab.c: mkwcw widths.txt EastAsianWidth.txt UnicodeData.txt
282+ cat widths.txt EastAsianWidth.txt | ./mkwcw N UnicodeData.txt \
283+ > widthtab.c
284+endif
285+
286+############################################################################
287+
271288 # AUTOMAKE'S RULES WILL GO HERE
272289
273290 automake_rules = here
--- trunk/idsgrep/mkwcw.c (nonexistent)
+++ trunk/idsgrep/mkwcw.c (revision 492)
@@ -0,0 +1,549 @@
1+/*
2+ * Wide-character-width function summarizer
3+ * Copyright (C) 2013 Matthew Skala
4+ *
5+ * This program is free software: you can redistribute it and/or modify
6+ * it under the terms of the GNU General Public License as published by
7+ * the Free Software Foundation, version 3.
8+ *
9+ * This program is distributed in the hope that it will be useful,
10+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+ * GNU General Public License for more details.
13+ *
14+ * You should have received a copy of the GNU General Public License
15+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
16+ *
17+ * Matthew Skala
18+ * http://ansuz.sooke.bc.ca/
19+ * mskala@ansuz.sooke.bc.ca
20+ */
21+
22+/*
23+ * This is meta-code, not intended for end-user use. Its basic function
24+ * is to read the EastAsianWidth.txt data file of Unicode Standard Annex #11,
25+ * or other data more or less in that format; build an optimized binary
26+ * decision diagram implementing the function that determines whether the
27+ * UTF-8 byte sequence for a character encodes a wide (two-column) character;
28+ * and then write a fragment of C code to evaluate that function. This
29+ * meta-code requires the BuDDy binary decision diagram library, but the
30+ * code it generates does not.
31+ */
32+
33+/*
34+ * Invocation:
35+ * stdin - EastAsianWidth.txt data (or other data in that format). In
36+ * case of more than one line mentioning the same character, earlier
37+ * mentions take priority unless ignored as ambiguous
38+ * stdout - C source code
39+ * argv[1] if specified - W, N, or 0 to treat A (ambiguous) lines as
40+ * Wide, Narrow, or zero-width. Specify A or do not specify for the
41+ * default treatment of ignoring them.
42+ * argv[2] if specified - UnicodeData.txt file, scanned for control and
43+ * combining characters, which will become zero-width.
44+ */
45+
46+/*
47+ * Note that the return value of the generated code on byte sequences that
48+ * are not the correct UTF-8 encodings of characters specified unambiguously
49+ * by the input, is not defined! Such byte sequences will at least return
50+ * "wide" or "narrow," but which one will be whichever seems to give the most
51+ * compact binary decision diagram. Specify dispositions for all characters
52+ * you care about, and do not run the generated code on things other than
53+ * valid UTF-8.
54+ */
55+
56+#include <stdio.h>
57+#include <stdlib.h>
58+
59+#include <bdd.h>
60+
61+#include "_stdint.h"
62+
63+/**********************************************************************/
64+
65+BDD bits_range_bdd(uint32_t low_bits,uint32_t high_bits) {
66+ BDD rval,tail,newbdd;
67+ int i;
68+ uint32_t b;
69+
70+ if (low_bits>0) {
71+ tail=bddtrue;
72+ for (i=31,b=1;i>=0;i--,b<<=1) {
73+ if (low_bits&b)
74+ newbdd=bdd_addref(bdd_and(bdd_ithvar(i),tail));
75+ else
76+ newbdd=bdd_addref(bdd_or(bdd_ithvar(i),tail));
77+ bdd_delref(tail);
78+ tail=newbdd;
79+ }
80+ rval=tail;
81+ } else
82+ rval=bddtrue;
83+
84+ if (high_bits<UINT32_MAX) {
85+ tail=bddtrue;
86+ for (i=31,b=1;i>=0;i--,b<<=1) {
87+ if (high_bits&b)
88+ newbdd=bdd_addref(bdd_or(bdd_nithvar(i),tail));
89+ else
90+ newbdd=bdd_addref(bdd_and(bdd_nithvar(i),tail));
91+ bdd_delref(tail);
92+ tail=newbdd;
93+ }
94+ rval=bdd_addref(bdd_and(rval,tail));
95+ bdd_delref(tail);
96+ }
97+
98+ return rval;
99+}
100+
101+BDD code_range_bdd(uint32_t low_code,uint32_t high_code) {
102+ BDD rval,subrange,srx;
103+
104+ if (low_code<0x80)
105+ rval=bits_range_bdd((low_code<<24),
106+ (high_code>=0x80?
107+ 0x7FFFFFFF:
108+ high_code<<24));
109+ else
110+ rval=bddfalse;
111+
112+ if ((low_code<0x800) && (high_code>=0x80)) {
113+ subrange=bits_range_bdd((low_code<0x80?
114+ 0xC2800000:
115+ ((low_code& 0x3F)<<16)+
116+ ((low_code&0x7C0)<<18)+
117+ 0xC0800000),
118+ (high_code>=0x800?
119+ 0xDFBFFFFF:
120+ ((high_code& 0x3F)<<16)+
121+ ((high_code&0x7C0)<<18)+
122+ 0xC080FFFF));
123+
124+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(9)));
125+ bdd_delref(subrange);
126+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(8)));
127+ bdd_delref(srx);
128+ srx=bdd_addref(bdd_or(rval,subrange));
129+ bdd_delref(rval);
130+ bdd_delref(subrange);
131+ rval=srx;
132+ }
133+
134+ if ((low_code<0x10000) && (high_code>=0x800)) {
135+ subrange=bits_range_bdd((low_code<0x800?
136+ 0xE0A00000:
137+ ((low_code& 0x3F)<<8)+
138+ ((low_code& 0xFC0)<<10)+
139+ ((low_code&0xF000)<<12)+
140+ 0xE0808000),
141+ (high_code>=0x10000?
142+ 0xEFBFBFFF:
143+ ((high_code& 0x3F)<<8)+
144+ ((high_code& 0xFC0)<<10)+
145+ ((high_code&0xF000)<<12)+
146+ 0xE08080FF));
147+
148+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(17)));
149+ bdd_delref(subrange);
150+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(16)));
151+ bdd_delref(srx);
152+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(9)));
153+ bdd_delref(subrange);
154+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(8)));
155+ bdd_delref(srx);
156+ srx=bdd_addref(bdd_or(rval,subrange));
157+ bdd_delref(rval);
158+ bdd_delref(subrange);
159+ rval=srx;
160+ }
161+
162+ if (high_code>=0x10000) {
163+ subrange=bits_range_bdd((low_code<0x10000?
164+ 0xF0900000:
165+ (low_code& 0x3F)+
166+ ((low_code& 0xFC0)<<2)+
167+ ((low_code& 0x3F000)<<4)+
168+ ((low_code&0x1C0000)<<6)+
169+ 0xF0808080),
170+ (high_code& 0x3F)+
171+ ((high_code& 0xFC0)<<2)+
172+ ((high_code& 0x3F000)<<4)+
173+ ((high_code&0x1C0000)<<6)+
174+ 0xF0808080);
175+
176+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(25)));
177+ bdd_delref(subrange);
178+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(24)));
179+ bdd_delref(srx);
180+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(17)));
181+ bdd_delref(subrange);
182+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(16)));
183+ bdd_delref(srx);
184+ srx=bdd_addref(bdd_and(subrange,bdd_nithvar(9)));
185+ bdd_delref(subrange);
186+ subrange=bdd_addref(bdd_and(srx,bdd_ithvar(8)));
187+ bdd_delref(srx);
188+ srx=bdd_addref(bdd_or(rval,subrange));
189+ bdd_delref(rval);
190+ bdd_delref(subrange);
191+ rval=srx;
192+ }
193+
194+ return rval;
195+}
196+
197+/**********************************************************************/
198+
199+static BDD defined_codes,zero_codes,wide_codes;
200+
201+void set_range_width(uint32_t low_code,uint32_t high_code,int width) {
202+ BDD x,y;
203+
204+ /* sanity check */
205+ if (low_code>high_code)
206+ return;
207+
208+ /* make a BDD for the range */
209+ x=code_range_bdd(low_code,high_code);
210+
211+ /* if (in range) /\ (already defined) === true, we've been pre-empted */
212+ y=bdd_and(x,defined_codes); /* note no addref, we don't need it */
213+ if (y==x) {
214+ bdd_delref(x);
215+ return;
216+ }
217+
218+ /* if (in range) /\ (already defined) !== false, must split */
219+ if (y!=bddfalse) {
220+ bdd_delref(x);
221+ set_range_width(low_code,(low_code+high_code)/2,width);
222+ set_range_width((low_code+high_code)/2+1,high_code,width);
223+ return;
224+ }
225+
226+ y=bdd_addref(bdd_or(x,defined_codes));
227+ bdd_delref(defined_codes);
228+ defined_codes=y;
229+
230+ if (width==0) {
231+ y=bdd_addref(bdd_or(x,zero_codes));
232+ bdd_delref(zero_codes);
233+ zero_codes=y;
234+ } else if (width==2) {
235+ y=bdd_addref(bdd_or(x,wide_codes));
236+ bdd_delref(wide_codes);
237+ wide_codes=y;
238+ }
239+
240+ bdd_delref(x);
241+}
242+
243+/**********************************************************************/
244+
245+typedef enum _PARSE_STATE {
246+ psLOW,psHIGH,psWIDTH,psSEMI,psSTOP
247+} PARSE_STATE;
248+
249+static BDD reorder_focus;
250+
251+int reordering_size_callback(void) {
252+ return reorder_focus==bddfalse?
253+ bdd_getnodenum():bdd_nodecount(reorder_focus);
254+}
255+
256+int main(int argc,char **argv) {
257+ char ambiguous_treatment='A';
258+ char linebuff[1024];
259+ char *parseptr;
260+ PARSE_STATE ps;
261+ uint32_t low_code,high_code;
262+ int width,i,j,vi,vj;
263+ FILE *unicode_db;
264+ BDD x,y,child[8];
265+ BDD *queue;
266+ int queue_low,queue_high,queue_max;
267+
268+ puts("/*\n"
269+ " * GENERATED CODE - DO NOT EDIT!\n"
270+ " * Edit mkwcw.c, which generates this, or the input to that\n"
271+ " * program, instead. Distributions of IDSgrep will nonetheless\n"
272+ " * usually include a ready-made copy of this file because\n"
273+ " * compiling and running mkwcw.c requires a library and data\n"
274+ " * file that, although free, not everyone is expected to have.\n"
275+ " */\n\n"
276+ "#include \"_stdint.h\"\n"
277+ );
278+
279+ if (argc>1)
280+ ambiguous_treatment=argv[1][0]&~32;
281+
282+ bdd_init(1000000,15625);
283+ bdd_setcacheratio(64);
284+ bdd_setvarnum(32);
285+ bdd_gbc_hook(NULL);
286+
287+ defined_codes=bddfalse;
288+ zero_codes=bddfalse;
289+ wide_codes=bddfalse;
290+
291+ /* yes, unfortunately UnicodeData.txt and EastAsianWidth.txt are just
292+ * different enough to need separate parsers, at least if the parsers
293+ * are as stupid as I'd like these ones to be */
294+
295+ if (argc>2) {
296+ unicode_db=fopen(argv[2],"rt");
297+
298+ while (1) {
299+ fgets(linebuff,sizeof(linebuff),unicode_db);
300+ if (feof(unicode_db))
301+ break;
302+
303+ ps=psLOW;
304+ linebuff[sizeof(linebuff)-1]='\0';
305+ low_code=0;
306+ width=-1;
307+
308+ for (parseptr=linebuff;(*parseptr) && (ps!=psSTOP);parseptr++)
309+ switch (ps) {
310+
311+ case psLOW:
312+ if ((*parseptr>='0') && (*parseptr<='9'))
313+ low_code=(low_code<<4)+(*parseptr-'0');
314+ else if ((*parseptr>='a') && (*parseptr<='f'))
315+ low_code=(low_code<<4)+(*parseptr-'a'+10);
316+ else if ((*parseptr>='A') && (*parseptr<='F'))
317+ low_code=(low_code<<4)+(*parseptr-'A'+10);
318+ else if (*parseptr==';')
319+ ps=psSEMI;
320+ else if ((*parseptr==' ') || (*parseptr=='\t'))
321+ { /* skip spaces and tabs */ }
322+ else
323+ ps=psSTOP; /* this catches comment lines */
324+ break;
325+
326+ case psSEMI:
327+ if (*parseptr==';')
328+ ps=psWIDTH;
329+ break;
330+
331+ case psWIDTH:
332+ if (((parseptr[0]=='M') && ((parseptr[1]=='e') ||
333+ (parseptr[1]=='n'))) ||
334+ ((parseptr[0]=='C') && (parseptr[1]=='f')))
335+ width=0;
336+ /* FALL THROUGH */
337+
338+ default:
339+ ps=psSTOP;
340+ break;
341+ }
342+
343+ if (width==0)
344+ set_range_width(low_code,low_code,0);
345+ }
346+
347+ fclose(unicode_db);
348+ }
349+
350+ while (1) {
351+ fgets(linebuff,sizeof(linebuff),stdin);
352+ if (feof(stdin))
353+ break;
354+
355+ ps=psLOW;
356+ linebuff[sizeof(linebuff)-1]='\0';
357+ low_code=0;
358+ high_code=0;
359+ width=-1;
360+
361+ for (parseptr=linebuff;(*parseptr) && (ps!=psSTOP);parseptr++)
362+ switch (ps) {
363+
364+ case psLOW:
365+ if ((*parseptr>='0') && (*parseptr<='9'))
366+ low_code=(low_code<<4)+(*parseptr-'0');
367+ else if ((*parseptr>='a') && (*parseptr<='f'))
368+ low_code=(low_code<<4)+(*parseptr-'a'+10);
369+ else if ((*parseptr>='A') && (*parseptr<='F'))
370+ low_code=(low_code<<4)+(*parseptr-'A'+10);
371+ else if (*parseptr=='.')
372+ ps=psHIGH;
373+ else if (*parseptr==';') {
374+ high_code=low_code;
375+ ps=psWIDTH;
376+ } else if ((*parseptr==' ') || (*parseptr=='\t'))
377+ { /* skip spaces and tabs */ }
378+ else
379+ ps=psSTOP; /* this catches comment lines */
380+ break;
381+
382+ case psHIGH:
383+ if ((*parseptr>='0') && (*parseptr<='9'))
384+ high_code=(high_code<<4)+(*parseptr-'0');
385+ else if ((*parseptr>='a') && (*parseptr<='f'))
386+ high_code=(high_code<<4)+(*parseptr-'a'+10);
387+ else if ((*parseptr>='A') && (*parseptr<='F'))
388+ high_code=(high_code<<4)+(*parseptr-'A'+10);
389+ else if ((*parseptr=='.') || (*parseptr==' ') || (*parseptr=='\t'))
390+ { /* skip spaces, tabs, and dots */ }
391+ else if (*parseptr==';')
392+ ps=psWIDTH;
393+ else
394+ ps=psSTOP;
395+ break;
396+
397+ case psWIDTH:
398+ if (*parseptr=='A')
399+ *parseptr=ambiguous_treatment;
400+ switch (*parseptr) {
401+ case 'F': /* full-width treated as wide */
402+ case 'W': /* wide */
403+ width=2;
404+ break;
405+
406+ case 'H': /* half-width treated as narrow */
407+ case 'N': /* narrow or neutral */
408+ width=1;
409+ break;
410+
411+ case '0': /* zero-width - should only appear in user database */
412+ width=0;
413+ break;
414+
415+ default:
416+ /* ignore all others */
417+ break;
418+ }
419+ /* FALL THROUGH */
420+
421+ default:
422+ ps=psSTOP;
423+ break;
424+ }
425+
426+ if (width>=0)
427+ set_range_width(low_code,high_code,width);
428+ }
429+
430+ printf("/* node counts before simplification: %d %d %d */\n",
431+ bdd_nodecount(defined_codes),
432+ bdd_nodecount(zero_codes),
433+ bdd_nodecount(wide_codes));
434+
435+ x=bdd_addref(bdd_simplify(wide_codes,defined_codes));
436+ bdd_delref(wide_codes);
437+ wide_codes=x;
438+
439+ x=bdd_addref(bdd_apply(defined_codes,wide_codes,bddop_diff));
440+ bdd_delref(defined_codes);
441+ defined_codes=x;
442+
443+ x=bdd_addref(bdd_simplify(zero_codes,defined_codes));
444+ bdd_delref(zero_codes);
445+ zero_codes=x;
446+
447+ printf("/* node counts after simplification: %d %d %d */\n\n",
448+ bdd_nodecount(defined_codes),
449+ bdd_nodecount(zero_codes),
450+ bdd_nodecount(wide_codes));
451+
452+ bdd_varblockall();
453+ bdd_intaddvarblock(0,7,0);
454+ bdd_intaddvarblock(8,15,0);
455+ bdd_intaddvarblock(16,23,0);
456+ bdd_intaddvarblock(24,31,0);
457+ bdd_intaddvarblock(0,31,1);
458+
459+ bdd_reorder_probe(&reordering_size_callback);
460+
461+ puts("typedef struct _WIDTH_BBD_ENT {\n"
462+ " int16_t child[8];\n"
463+ " char byte,shift;\n"
464+ "} WIDTH_BDD_ENT;\n\n"
465+ "static WIDTH_BDD_ENT width_bdd[]={");
466+
467+ queue=(BDD *)malloc(sizeof(BDD)*1000);
468+ queue_max=1000;
469+ queue_low=2;
470+ queue_high=4;
471+ queue[0]=bddfalse;
472+ queue[1]=bddtrue;
473+ queue[2]=wide_codes;
474+ queue[3]=zero_codes;
475+
476+ while (queue_low<queue_high) {
477+ if (queue_high+8>queue_max) {
478+ queue_max/=3;
479+ queue_max*=4;
480+ queue=(BDD *)realloc(queue,sizeof(BDD)*queue_max);
481+ }
482+
483+ reorder_focus=queue[queue_low];
484+ bdd_reorder(BDD_REORDER_WIN2ITE);
485+
486+ vj=bdd_var(queue[queue_low]);
487+ vi=(vj/8)*8;
488+ vj=((vj-vi+1)/3)*3-1;
489+ if (vj<0) vj=0;
490+
491+ x=bdd_addref(bdd_restrict(queue[queue_low],bdd_nithvar(vi+vj)));
492+ y=bdd_addref(bdd_restrict(x,bdd_nithvar(vi+vj+1)));
493+ child[0]=bdd_addref(bdd_restrict(y,bdd_nithvar(vi+vj+2)));
494+ child[1]=bdd_addref(bdd_restrict(y,bdd_ithvar(vi+vj+2)));
495+ bdd_delref(y);
496+ y=bdd_addref(bdd_restrict(x,bdd_ithvar(vi+vj+1)));
497+ child[2]=bdd_addref(bdd_restrict(y,bdd_nithvar(vi+vj+2)));
498+ child[3]=bdd_addref(bdd_restrict(y,bdd_ithvar(vi+vj+2)));
499+ bdd_delref(y);
500+ bdd_delref(x);
501+ x=bdd_addref(bdd_restrict(queue[queue_low],bdd_ithvar(vi+vj)));
502+ y=bdd_addref(bdd_restrict(x,bdd_nithvar(vi+vj+1)));
503+ child[4]=bdd_addref(bdd_restrict(y,bdd_nithvar(vi+vj+2)));
504+ child[5]=bdd_addref(bdd_restrict(y,bdd_ithvar(vi+vj+2)));
505+ bdd_delref(y);
506+ y=bdd_addref(bdd_restrict(x,bdd_ithvar(vi+vj+1)));
507+ child[6]=bdd_addref(bdd_restrict(y,bdd_nithvar(vi+vj+2)));
508+ child[7]=bdd_addref(bdd_restrict(y,bdd_ithvar(vi+vj+2)));
509+ bdd_delref(y);
510+ bdd_delref(x);
511+
512+ fputs(" {{",stdout);
513+ for (i=0;i<8;i++) {
514+ queue[queue_high]=child[i];
515+ for (j=0;queue[j]!=child[i];j++);
516+ if (j==queue_high)
517+ queue_high++;
518+ else
519+ bdd_delref(child[i]);
520+ printf("%d",j-2);
521+ if (i<7) putchar(',');
522+ }
523+ printf("},%d,%d},\n",vi/8,5-vj);
524+
525+ queue_low++;
526+ }
527+
528+ puts("};\n\n"
529+"int idsgrep_utf8cw(char *);\n"
530+"\n"
531+"#define WBS width_bdd[search]\n"
532+"\n"
533+"int idsgrep_utf8cw(char *cp) {\n"
534+" int search;\n"
535+"\n"
536+" for (search=0;search>=0;)\n"
537+" search=WBS.child[(cp[WBS.byte]>>WBS.shift)&7];\n"
538+" if (search==-1)\n"
539+" return 2;\n"
540+" for (search=1;search>=0;)\n"
541+" search=WBS.child[(cp[WBS.byte]>>WBS.shift)&7];\n"
542+" return ((-1)-search);\n"
543+"}\n");
544+
545+ bdd_done();
546+
547+ exit(0);
548+}
549+
--- trunk/idsgrep/cook.c (revision 491)
+++ trunk/idsgrep/cook.c (revision 492)
@@ -26,6 +26,7 @@
2626 #include "idsgrep.h"
2727
2828 int cook_output=0;
29+int colourize_output=0;
2930 int canonicalize_input=1;
3031
3132 #define OS_TOP_HEAD_BRACKET_TYPE 0 /* 0 ASCII, 1 B lentic., 2 W lentic. */
@@ -34,7 +35,7 @@
3435 #define OS_UNARY_BRACKET_TYPE 3 /* 0 period, 1 colon, 2 centre dot */
3536 #define OS_BINARY_BRACKET_TYPE 4 /* 0 square bckt, 1 wide, 2 dbl wide */
3637 #define OS_TERNARY_BRACKET_TYPE 5 /* 0 curly brace, 1 B tort, 2 W tort */
37-#define OS_INDENTATION 6 /* 8=tab, else # of spaces */
38+#define OS_INDENTATION 6 /* 8=tab, 9=wrap, else # of spaces */
3839 #define OS_SEPARATOR 7 /* 0 null, 1 \n, 2 \n\n, 3 nothing */
3940 #define OS_SUGAR 8 /* 4 syrup @top +2 not @top +1 sugar */
4041 #define OS_ESCAPE_WHAT 9 /* increasing subsets from 0 to 7 */
@@ -43,13 +44,18 @@
4344
4445 #define NUM_OUTPUT_SETTINGS 12
4546
46-static char output_recipe[NUM_OUTPUT_SETTINGS]="100000013250";
47+static char output_recipe[NUM_OUTPUT_SETTINGS]="100000913250";
4748
49+static char *bracketed_colours[6]={
50+ "0;41;30","0;37","0;32","0;35","0;36","0;33"};
51+static char *sweetened_colours[6]={
52+ "0;41;30","1;37","1;32","1;35","1;36","1;33"};
53+
4854 #define NUM_PRESET_RECIPES 5
4955
5056 static struct {char *name,*recipe;} preset_recipe[NUM_PRESET_RECIPES]={
51- {"ascii", "000000013551"},
52- {"cooked","100000013250"},
57+ {"ascii", "000000913551"},
58+ {"cooked","100000913250"},
5359 {"indent","100000223250"},
5460 {"raw", "000000000000"},
5561 {"rawnc", "000000000004"},
@@ -100,8 +106,81 @@
100106
101107 /**********************************************************************/
102108
109+static char *wrap_buffer=NULL;
110+static int buffer_size=0,buffered_bytes=0,buffered_columns=0,
111+ current_column=0,wrap_allowed=0;
112+
113+void wrap_write(char *cp,int len,FILE *f) {
114+ int i,j;
115+
116+ if (output_recipe[OS_INDENTATION]!='9') {
117+ fwrite(cp,1,len,f);
118+ return;
119+ }
120+
121+ for (i=0;i<len;) {
122+ if ((cp[i]=='\n') || (cp[i]=='\f')) {
123+ fwrite(wrap_buffer,1,buffered_bytes,f);
124+ fputc(cp[i],f);
125+ buffered_bytes=0;
126+ buffered_columns=0;
127+ current_column=0;
128+ wrap_allowed=0;
129+
130+ i++;
131+
132+ } else if (wrap_allowed) {
133+ if (buffer_size<buffered_bytes+4) {
134+ if (wrap_buffer==NULL) {
135+ buffer_size=80*4;
136+ wrap_buffer=(char *)malloc(buffer_size);
137+ } else {
138+ buffer_size/=3;
139+ buffer_size*=4;
140+ wrap_buffer=(char *)realloc(wrap_buffer,buffer_size);
141+ }
142+ }
143+
144+ buffered_columns+=idsgrep_utf8cw(cp+i);
145+ while (1) {
146+ wrap_buffer[buffered_bytes++]=cp[i++];
147+ if ((cp[i]&0xC0)!=0x80)
148+ break;
149+ };
150+
151+ if (current_column+buffered_columns>76) {
152+ fwrite("\n ",1,4,f);
153+ fwrite(wrap_buffer,1,buffered_bytes,f);
154+ current_column=3+buffered_columns;
155+ buffered_columns=0;
156+ buffered_bytes=0;
157+ wrap_allowed=0;
158+ }
159+
160+ } else {
161+ j=char_length(cp+i);
162+ fwrite(cp+i,1,j,f);
163+ current_column+=idsgrep_utf8cw(cp+i);
164+
165+ i+=j;
166+ }
167+ }
168+}
169+
170+void wrap_flush(FILE *f) {
171+ if (buffered_bytes>0) {
172+ fwrite(wrap_buffer,1,buffered_bytes,f);
173+ current_column+=buffered_columns;
174+ buffered_columns=0;
175+ buffered_bytes=0;
176+ }
177+}
178+
179+/**********************************************************************/
180+
103181 void write_maybe_escaped_char(char *cp,HASHED_STRING *br,FILE *f) {
104- int c,do_esc;
182+ int c,do_esc,i;
183+ char out_buffer[11];
105184
106185 switch (char_length(cp)) {
107186 case 1:
@@ -152,7 +231,7 @@
152231 case '3':
153232 if (((output_recipe[OS_ESCAPE_HOW]=='3') || (c>=0x7F)) &&
154233 (c<=0xFF)) {
155- fprintf(f,"\\x%02X",c);
234+ sprintf(out_buffer,"\\x%02X",c);
156235 break;
157236 }
158237 /* FALL THROUGH */
@@ -160,7 +239,7 @@
160239 case '4':
161240 if (((output_recipe[OS_ESCAPE_HOW]=='4') || (c>0xFF)) &&
162241 (c<=0xFFFF)) {
163- fprintf(f,"\\X%04X",c);
242+ sprintf(out_buffer,"\\X%04X",c);
164243 break;
165244 }
166245 /* FALL THROUGH */
@@ -167,7 +246,7 @@
167246
168247 case '2':
169248 if ((output_recipe[OS_ESCAPE_HOW]=='2') || (c>0xFFFF)) {
170- fprintf(f,"\\x{%X}",c);
249+ sprintf(out_buffer,"\\x{%X}",c);
171250 break;
172251 }
173252 /* FALL THROUGH */
@@ -174,32 +253,34 @@
174253
175254 case '1':
176255 if ((c>=1) && (c<=27)) {
177- fputc('\\',f);
256+ out_buffer[0]='\\';
257+ out_buffer[2]='\0';
178258 switch (c) {
179259 case 7:
180- fputc('a',f);
260+ out_buffer[1]='a';
181261 break;
182262 case 8:
183- fputc('b',f);
263+ out_buffer[1]='b';
184264 break;
185265 case 27:
186- fputc('e',f);
266+ out_buffer[1]='e';
187267 break;
188268 case 12:
189- fputc('f',f);
269+ out_buffer[1]='f';
190270 break;
191271 case 9:
192- fputc('t',f);
272+ out_buffer[1]='t';
193273 break;
194274 case 10:
195- fputc('n',f);
275+ out_buffer[1]='n';
196276 break;
197277 case 13:
198- fputc('r',f);
278+ out_buffer[1]='r';
199279 break;
200280 default:
201- fputc('c',f);
202- fputc(c+'A'-1,f);
281+ out_buffer[1]='c';
282+ out_buffer[2]=c+'A'-1;
283+ out_buffer[3]='\0';
203284 break;
204285 }
205286 break;
@@ -210,22 +291,37 @@
210291 default:
211292 if ((output_recipe[OS_ESCAPE_HOW]=='5') &&
212293 ((c<=0x1F) || (c==0x7F))) {
213- fprintf(f,"\\x%02X",c);
294+ sprintf(out_buffer,"\\x%02X",c);
214295 } else {
215296 if (((c|0x20)<'a') || ((c|0x20)>'z'))
216- fputc('\\',f);
217- fwrite(cp,1,char_length(cp),f);
297+ out_buffer[0]='\\';
298+ i=1;
299+ while (1) {
300+ out_buffer[i]=cp[i-1];
301+ i++;
302+ if ((cp[i]&0xC0)!=0x80)
303+ break;
304+ }
305+ out_buffer[i]='\0';
218306 }
219307 break;
220308 }
309+
310+ wrap_write(out_buffer,strlen(out_buffer),f);
311+
221312 } else
222- fwrite(cp,1,char_length(cp),f);
313+ wrap_write(cp,char_length(cp),f);
223314 }
224315
225316 void write_bracketed_string(HASHED_STRING *hs,HASHED_STRING *br,FILE *f) {
226317 int i;
227318
228- fwrite(br->data,1,br->length,f);
319+ wrap_flush(f);
320+ if (colourize_output)
321+ fprintf(f,"\e[%sm",bracketed_colours[br->arity+2]);
322+
323+ wrap_allowed=1;
324+ wrap_write(br->data,br->length,f);
229325 for (i=0;i<hs->length;i+=char_length(hs->data+i)) {
230326 if ((i==0) && (output_recipe[OS_ESCAPE_WHAT]<'6'))
231327 write_maybe_escaped_char(hs->data+i,NULL,f);
@@ -232,7 +328,7 @@
232328 else
233329 write_maybe_escaped_char(hs->data+i,br->mate,f);
234330 }
235- fwrite(br->mate->data,1,br->mate->length,f);
331+ wrap_write(br->mate->data,br->mate->length,f);
236332 }
237333
238334 /**********************************************************************/
@@ -254,15 +350,17 @@
254350 tail=ms->child[i];
255351 }
256352
257- if ((output_recipe[OS_INDENTATION]!='0') && (ms->complete>0))
258- fputc('\n',f);
259- if (output_recipe[OS_INDENTATION]=='8')
260- for (i=0;i<ms->complete;i++)
261- fputc('\t',f);
262- else
263- for (i=0;i<(ms->complete*(output_recipe[OS_INDENTATION]-'0'));i++)
264- fputc(' ',f);
265-
353+ if (output_recipe[OS_INDENTATION]!='9') {
354+ if ((output_recipe[OS_INDENTATION]!='0') && (ms->complete>0))
355+ fputc('\n',f);
356+ if (output_recipe[OS_INDENTATION]=='8')
357+ for (i=0;i<ms->complete;i++)
358+ fputc('\t',f);
359+ else
360+ for (i=0;i<(ms->complete*(output_recipe[OS_INDENTATION]-'0'));i++)
361+ fputc(' ',f);
362+ }
363+
266364 if (((output_recipe[OS_SUGAR]&2) || (ms->complete==0)) &&
267365 ((output_recipe[OS_SUGAR]&4) || (ms->complete>0)) &&
268366 (ms->head!=NULL) &&
@@ -271,6 +369,10 @@
271369 (ms->head->arity==-2) &&
272370 (ms->arity==0) &&
273371 (ms->functor==semicolon)) {
372+ wrap_flush(f);
373+ if (colourize_output)
374+ fprintf(f,"\e[%sm",sweetened_colours[1]);
375+ wrap_allowed=1;
274376 write_maybe_escaped_char(ms->head->data,NULL,f);
275377
276378 } else {
@@ -307,7 +409,11 @@
307409 (mf->arity==ms->arity) &&
308410 (mf->mate==NULL) &&
309411 (char_length(mf->data)==mf->length)) {
310- fwrite(mf->data,mf->length,1,f);
412+ wrap_flush(f);
413+ if (colourize_output)
414+ fprintf(f,"\e[%sm",sweetened_colours[mf->arity+2]);
415+ wrap_allowed=1;
416+ wrap_write(mf->data,mf->length,f);
311417
312418 } else {
313419 i=output_recipe[OS_NULLARY_BRACKET_TYPE+ms->arity]-'0';
@@ -320,6 +426,10 @@
320426 ms=tail;
321427 }
322428
429+ wrap_flush(f);
430+ if (colourize_output)
431+ fwrite("\e[0;37m",7,1,f);
432+
323433 switch (output_recipe[OS_SEPARATOR]) {
324434 case '0':
325435 fputc('\0',f);
@@ -326,14 +436,13 @@
326436 break;
327437 /* 1 is default newline */
328438 case '2':
329- fputc('\n',f);
330- fputc('\n',f);
439+ wrap_write("\n\n",2,f);
331440 break;
332441 case '3':
333442 /* 3 is nothing */
334443 break;
335444 default:
336- fputc('\n',f);
445+ wrap_write("\n",1,f);
337446 break;
338447 }
339448
--- trunk/idsgrep/idsgrep.c (revision 491)
+++ trunk/idsgrep/idsgrep.c (revision 492)
@@ -417,6 +417,8 @@
417417 static struct option long_opts[] = {
418418 {"bitvec-debug",no_argument,NULL,'D'|128},
419419 {"statistics",no_argument,NULL,'s'|128},
420+ {"color",optional_argument,NULL,'C'},
421+ {"colour",optional_argument,NULL,'C'},
420422 {"cooking",required_argument,NULL,'c'},
421423 {"dictionary",optional_argument,NULL,'d'},
422424 {"font-chars",required_argument,NULL,'f'},
@@ -451,8 +453,17 @@
451453 register_syntax();
452454
453455 /* loop on command-line options */
454- while ((c=getopt_long(argc,argv,"GIU::Vc:d::f:h",long_opts,NULL))!=-1) {
456+ while ((c=getopt_long(argc,argv,"CGIU::Vc:d::f:h",long_opts,NULL))!=-1) {
455457 switch (c) {
458+
459+ case 'C':
460+ if ((!optarg) || (optarg[0]=='\0') || (!strcmp(optarg,"auto")))
461+ colourize_output=isatty(fileno(stdout));
462+ else
463+ colourize_output=!strcmp(optarg,"always");
464+ if (colourize_output)
465+ cook_output=1;
466+ break;
456467
457468 case 'G':
458469 generate_index=1;
Show on old repository browser