• R/O
  • SSH
  • HTTPS

tsukurimashou: Commit


Commit MetaInfo

Revision447 (tree)
Time2013-07-27 04:35:02
Authormskala

Log Message

bit vectors for IDSgrep, start of implementation

Change Summary

Incremental Difference

--- trunk/idsgrep/idsgrep.h (revision 446)
+++ trunk/idsgrep/idsgrep.h (revision 447)
@@ -19,6 +19,8 @@
1919 * mskala@ansuz.sooke.bc.ca
2020 */
2121
22+#include <sys/types.h>
23+
2224 #include "config.h"
2325 #include "_stdint.h"
2426
@@ -28,7 +30,11 @@
2830
2931 /**********************************************************************/
3032
33+struct _NODE;
34+struct _BIT_FILTER;
35+
3136 typedef struct _NODE *(*MATCH_FN)(struct _NODE *);
37+typedef void (*BITVEC_FN)(struct _NODE *,struct _BIT_FILTER *);
3238
3339 typedef enum _MATCH_RESULT {
3440 MR_INITIAL=0,
@@ -45,6 +51,7 @@
4551 size_t length;
4652 int refs,arity;
4753 MATCH_FN match_fn;
54+ BITVEC_FN needle_bits_fn;
4855 #ifdef HAVE_PCRE
4956 pcre *pcre_compiled;
5057 pcre_extra *pcre_studied;
@@ -84,6 +91,33 @@
8491
8592 /**********************************************************************/
8693
94+/* For consistency with Skala et al. (ACL 2010), LAMBDA is one *less*
95+ * than the number of bits set per item inserted in the Bloom filter,
96+ * i.e. the maximum number of bits that may be set and still have the
97+ * unification result be "fail." However, in this code we are closer
98+ * to a pure Bloom filter than in that paper, because here we choose
99+ * bit indices with replacement, accepting some collisions in the
100+ * interest of making the vectors a little less dense.
101+ */
102+
103+#define LAMBDA 3
104+
105+typedef struct _INDEX_HEADER {
106+ uint32_t magica,despell;
107+} INDEX_HEADER;
108+
109+typedef struct _INDEX_RECORD {
110+ uint64_t bits[2];
111+ uint32_t offset;
112+} INDEX_RECORD;
113+
114+typedef struct _BIT_FILTER {
115+ uint64_t bits[2];
116+ int lambda;
117+} BIT_FILTER;
118+
119+/**********************************************************************/
120+
87121 /* assoc.c */
88122
89123 NODE *assoc_match_fn(NODE *);
@@ -90,6 +124,33 @@
90124
91125 /**********************************************************************/
92126
127+/* bitvec.c */
128+
129+#define MSEED_SIZE 13
130+
131+extern uint32_t magic_seed[MSEED_SIZE];
132+
133+void default_needle_bits_fn(NODE *,BIT_FILTER *);
134+void haystack_bits_fn(NODE *,uint64_t bits[2]);
135+
136+#if UINT64_MAX==UINT_MAX
137+# define BITVEC_MATCH(n,h) \
138+ ((n)->lambda<(__builtin_popcount((n)->bits[0]&(h)->bits[0]) \
139+ +__builtin_popcount((n)->bits[1]&(h)->bits[1])))
140+#else
141+# if UINT64_MAX==ULONG_MAX
142+# define BITVEC_MATCH(n,h) \
143+ ((n)->lambda<(__builtin_popcountl((n)->bits[0]&(h)->bits[0]) \
144+ +__builtin_popcountl((n)->bits[1]&(h)->bits[1])))
145+# else
146+# define BITVEC_MATCH(n,h) \
147+ ((n)->lambda<(__builtin_popcountll((n)->bits[0]&(h)->bits[0]) \
148+ +__builtin_popcountll((n)->bits[1]&(h)->bits[1])))
149+# endif
150+#endif
151+
152+/**********************************************************************/
153+
93154 /* cook.c */
94155
95156 extern int cook_output,canonicalize_input;
@@ -156,3 +217,35 @@
156217 void font_file_userpred(char *);
157218
158219 NODE *user_match_fn(NODE *);
220+
221+/**********************************************************************/
222+
223+/* Fowler-Noll-Vo hash, type 1a, on unsigned ints or uint32_ts,
224+ * whichever are bigger, but always folding to a uint32_t.
225+ * The "bump" argument, if nonzero, has the effect of hashing an
226+ * extra zero byte before the input; we use it to get two different
227+ * hash functions for (almost) the price of one, which is useful in
228+ * a few places in the code. */
229+
230+static inline uint32_t fnv_hash(int m,char *a,int bump) {
231+ int i;
232+#if UINT_MAX==0xFFFFFFFFFFFFFFFF
233+ unsigned int hval=bump?12638153115695167455U:14695981039346656037U;
234+
235+ for (i=0;i<m;i++) {
236+ hval^=a[i];
237+ hval*=1099511628211U;
238+ }
239+
240+ return (uint32_t)(hval^(hval>>32));
241+#else
242+ uint32_t hval=bump?UINT32_C(84696351):UINT32_C(2166136261);
243+
244+ for (i=0;i<m;i++) {
245+ hval^=a[i];
246+ hval*=UINT32_C(16777619);
247+ }
248+
249+ return hval;
250+#endif
251+}
--- trunk/idsgrep/bitvec.c (nonexistent)
+++ trunk/idsgrep/bitvec.c (revision 447)
@@ -0,0 +1,133 @@
1+/*
2+ * Bit vector matching for IDSgrep
3+ * Copyright (C) 2013 Matthew Skala
4+ *
5+ * This program is free software: you can redistribute it and/or modify
6+ * it under the terms of the GNU General Public License as published by
7+ * the Free Software Foundation, version 3.
8+ *
9+ * This program is distributed in the hope that it will be useful,
10+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+ * GNU General Public License for more details.
13+ *
14+ * You should have received a copy of the GNU General Public License
15+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
16+ *
17+ * Matthew Skala
18+ * http://ansuz.sooke.bc.ca/
19+ * mskala@ansuz.sooke.bc.ca
20+ */
21+
22+#include <stdio.h>
23+
24+#include "idsgrep.h"
25+
26+/**********************************************************************/
27+
28+/* Notes on bit vector layout:
29+ *
30+ * bits[0], least significant 32 bits: head and functor of root
31+ * bits[0], most significant 32 bits: head and functor of first/leftmost
32+ * child of root
33+ * bits[1], least signficant 32 bits: head and functor of last/rightmost
34+ * child of root (duplicates first/left if root is unary!)
35+ * bits[1], most significant 32 bits: head and functors of middle
36+ * child if any, and any grandchildren or deeper descendants
37+ */
38+
39+/**********************************************************************/
40+
41+/* "Hit your head on the corner of a block of tofu and die!" */
42+
43+uint32_t magic_seed[MSEED_SIZE]={
44+ UINT32_C(0xe8b186e8),UINT32_C(0x8590e381),UINT32_C(0xaee8a792),
45+ UINT32_C(0xe381abe9),UINT32_C(0xa0ade382),UINT32_C(0x92e381b6),
46+ UINT32_C(0xe381a4e3),UINT32_C(0x8191e381),UINT32_C(0xa6e6adbb),
47+ UINT32_C(0xe381ad21),
48+ (uint32_t)LAMBDA,
49+ (uint32_t)sizeof(INDEX_HEADER),(uint32_t)sizeof(INDEX_RECORD),
50+};
51+
52+/**********************************************************************/
53+
54+void default_needle_bits_fn(NODE *n,BIT_FILTER *f) {
55+ f->bits[0]=UINT64_MAX;
56+ f->bits[1]=UINT64_MAX;
57+ f->lambda=-1;
58+}
59+
60+/**********************************************************************/
61+
62+void haystack_bits_fn(NODE *n,uint64_t bits[2]) {
63+ int i;
64+ uint32_t hval;
65+ uint64_t cbits[2];
66+
67+ bits[0]=UINT64_C(0);
68+ bits[1]=UINT64_C(0);
69+
70+ /* head */
71+ if (n->head!=NULL) {
72+ hval=fnv_hash(n->head->length,n->head->data,0);
73+ for (i=0;i<=LAMBDA;i++) {
74+ bits[0]|=(UINT64_C(1)<<(hval&31));
75+ hval>>=5;
76+ }
77+ }
78+
79+ /* functor */
80+ hval=fnv_hash(n->functor->length,n->functor->data,1)+
81+ UINT32_C(1108378657)*n->arity;
82+ for (i=0;i<=LAMBDA;i++) {
83+ bits[0]|=(UINT64_C(1)<<(hval&31));
84+ hval>>=5;
85+ }
86+
87+ /* children */
88+ switch (n->arity) {
89+ case 0:
90+ /* DO NOTHING */
91+ break;
92+
93+ case 1:
94+ haystack_bits_fn(n->child[0],cbits);
95+ bits[0]|=(cbits[0]<<32);
96+ bits[1]|=cbits[0];
97+ bits[1]|=
98+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
99+ break;
100+
101+ case 2:
102+ haystack_bits_fn(n->child[0],cbits);
103+ bits[0]|=(cbits[0]<<32);
104+ bits[0]|=(cbits[0]&UINT64_C(0xFFFFFFFF00000000));
105+ bits[1]|=
106+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
107+
108+ haystack_bits_fn(n->child[1],cbits);
109+ bits[1]|=cbits[0];
110+ bits[1]|=
111+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
112+ break;
113+
114+ case 3:
115+ haystack_bits_fn(n->child[0],cbits);
116+ bits[0]|=(cbits[0]<<32);
117+ bits[0]|=(cbits[0]&UINT64_C(0xFFFFFFFF00000000));
118+ bits[1]|=
119+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
120+
121+ haystack_bits_fn(n->child[1],cbits);
122+ bits[1]|=
123+ ((cbits[0]|(cbits[0]<<32))&UINT64_C(0xFFFFFFFF00000000));
124+ bits[1]|=
125+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
126+
127+ haystack_bits_fn(n->child[2],cbits);
128+ bits[1]|=cbits[0];
129+ bits[1]|=
130+ ((cbits[1]|(cbits[1]<<32))&UINT64_C(0xFFFFFFFF00000000));
131+ break;
132+ }
133+}
--- trunk/idsgrep/configure.ac (revision 446)
+++ trunk/idsgrep/configure.ac (revision 447)
@@ -236,6 +236,7 @@
236236 #
237237 # Checks for typedefs, structures, and compiler characteristics.
238238 #
239+AC_C_INLINE
239240 AC_CHECK_SIZEOF([int])
240241 AC_TYPE_SIZE_T
241242 AX_C___ATTRIBUTE__
--- trunk/idsgrep/hash.c (revision 446)
+++ trunk/idsgrep/hash.c (revision 447)
@@ -49,7 +49,7 @@
4949 else
5050 for (i=5;(((size_t)1)<<i)<len;i++);
5151 if (free_strings[i]==NULL) {
52- rval=(HASHED_STRING *)malloc(sizeof(HASHED_STRING)+(((size_t)1)<<i));
52+ rval=(HASHED_STRING *)malloc(sizeof(HASHED_STRING)+(((size_t)1)<<i)+1);
5353 memset(rval,0,sizeof(HASHED_STRING));
5454 rval->data=(char *)(rval+1);
5555 } else {
@@ -62,6 +62,7 @@
6262 rval->length=len;
6363 rval->arity=-2;
6464 rval->match_fn=default_match_fn;
65+ rval->needle_bits_fn=default_needle_bits_fn;
6566 #ifdef HAVE_PCRE
6667 rval->pcre_compiled=NULL;
6768 rval->pcre_studied=NULL;
@@ -97,18 +98,8 @@
9798 static int hash_table_size=0;
9899 static int hash_table_occupancy=0;
99100
100-unsigned int hash_function(size_t len,char *s) {
101- unsigned int rval=1;
102- size_t i;
103-
104- for (i=0;i<len;i++) {
105- rval=(rval<<1)^rval^(rval>>1)^(unsigned int)(s[i]);
106- }
107- return rval;
108-}
109-
110101 HASHED_STRING *new_string(size_t len,char *s) {
111- unsigned int h,tmph;
102+ uint32_t h,tmph;
112103 int i;
113104 HASHED_STRING *tmps;
114105 HASHED_STRING **new_table;
@@ -122,7 +113,7 @@
122113 }
123114
124115 /* search for the string in the hash table */
125- h=hash_function(len,s);
116+ h=fnv_hash(len,s,0);
126117 for (tmps=hash_table[h%hash_table_size];
127118 tmps && ((tmps->length!=len) || memcmp(tmps->data,s,len));
128119 tmps=tmps->next);
@@ -145,7 +136,7 @@
145136 while (hash_table[i]) {
146137 tmps=hash_table[i];
147138 hash_table[i]=tmps->next;
148- tmph=hash_function(tmps->length,tmps->data)%hash_table_size;
139+ tmph=fnv_hash(tmps->length,tmps->data,0)%hash_table_size;
149140 tmps->next=new_table[tmph];
150141 new_table[tmph]=tmps;
151142 }
@@ -176,7 +167,7 @@
176167
177168 s->refs--;
178169 if (s->refs==0) {
179- h=hash_function(s->length,s->data)%hash_table_size;
170+ h=fnv_hash(s->length,s->data,0)%hash_table_size;
180171 if (hash_table[h]==s) {
181172 hash_table[h]=s->next;
182173 } else {
--- trunk/idsgrep/Makefile.am (revision 446)
+++ trunk/idsgrep/Makefile.am (revision 447)
@@ -153,8 +153,8 @@
153153
154154 AM_CFLAGS := $(MAYBE_COVERAGE) $(PCRE_CFLAGS) $(AM_CFLAGS)
155155 idsgrep_SOURCES = \
156- assoc.c cook.c hash.c idsgrep.c idsgrep.h match.c parse.c \
157- regex.c unilist.c userpred.c
156+ assoc.c bitvec.c cook.c hash.c idsgrep.c idsgrep.h match.c \
157+ parse.c regex.c unilist.c userpred.c
158158
159159 LDADD = @LIBOBJS@ $(PCRE_LIBS)
160160
--- trunk/idsgrep/idsgrep.c (revision 446)
+++ trunk/idsgrep/idsgrep.c (revision 447)
@@ -30,6 +30,9 @@
3030
3131 /**********************************************************************/
3232
33+static int generate_index=0,ignore_indices=0;
34+
35+/* basic search and index generation */
3336 void process_file(NODE *match_pattern,char *fn,int fn_flag) {
3437 int read_amt,flag,i;
3538 NODE *to_match;
@@ -37,14 +40,25 @@
3740 int inbuf_size=0,inbuf_used=0,parse_ptr=0;
3841 FILE *infile;
3942 HASHED_STRING *hfn,*colon;
43+ off_t offset=0;
44+ INDEX_HEADER ih;
45+ INDEX_RECORD ir;
4046
41- /* wrap the filename in a string so we can escape-print it */
42- if (fn_flag>=0)
43- hfn=new_string(strlen(fn+fn_flag),fn+fn_flag);
44- else
45- hfn=new_string(strlen(fn),fn);
46- colon=new_string(1,":");
47-
47+ if (generate_index) {
48+ /* write header */
49+ ih.magica=fnv_hash(MSEED_SIZE*sizeof(uint32_t),(char *)magic_seed,0);
50+ ih.despell=fnv_hash(MSEED_SIZE*sizeof(uint32_t),(char *)magic_seed,1);
51+ fwrite(&ih,sizeof(INDEX_HEADER),1,stdout);
52+
53+ } else {
54+ /* wrap the filename in a string so we can escape-print it */
55+ if (fn_flag>=0)
56+ hfn=new_string(strlen(fn+fn_flag),fn+fn_flag);
57+ else
58+ hfn=new_string(strlen(fn),fn);
59+ colon=new_string(1,":");
60+ }
61+
4862 /* open input file */
4963 if (strcmp(fn,"-")) {
5064 infile=fopen(fn,"rb");
@@ -94,7 +108,16 @@
94108 if (parse_state==PS_COMPLETE_TREE) {
95109 to_match=parse_stack[0];
96110 stack_ptr=0;
97- if (tree_match(match_pattern,to_match)) {
111+
112+ if (generate_index) {
113+ for (i=0;((unsigned char)input_buffer[i])<=0x20;i++)
114+ offset++;
115+ ir.offset=offset;
116+ haystack_bits_fn(to_match,ir.bits);
117+ fwrite(&ir,sizeof(INDEX_RECORD),1,stdout);
118+ offset+=(parse_ptr-i);
119+
120+ } else if (tree_match(match_pattern,to_match)) {
98121 for (i=0;((unsigned char)input_buffer[i])<=0x20;i++);
99122 if (fn_flag>=0)
100123 write_bracketed_string(hfn,colon);
@@ -105,6 +128,7 @@
105128 echoing_whitespace=1;
106129 }
107130 }
131+
108132 free_node(to_match);
109133 if (parse_ptr<inbuf_used)
110134 memmove(input_buffer,input_buffer+parse_ptr,
@@ -120,18 +144,41 @@
120144 fclose(infile);
121145
122146 free(input_buffer);
123- delete_string(hfn);
124- delete_string(colon);
147+
148+ if (generate_index) {
149+ offset+=inbuf_used;
150+ ir.offset=offset;
151+ ir.bits[0]=UINT64_C(0);
152+ ir.bits[1]=UINT64_C(0);
153+ fwrite(&ir,sizeof(INDEX_RECORD),1,stdout);
154+ } else {
155+ delete_string(hfn);
156+ delete_string(colon);
157+ }
125158 }
126159
127160 /**********************************************************************/
128161
162+void process_file_indexed(NODE *match_pattern,char *fn,int fn_flag) {
163+ if (ignore_indices || generate_index || (strcmp(fn,"-")==0)) {
164+ process_file(match_pattern,fn,fn_flag);
165+ return;
166+ }
167+
168+ /* FIXME - do search with index */
169+ process_file(match_pattern,fn,fn_flag);
170+}
171+
172+/**********************************************************************/
173+
129174 static struct option long_opts[] = {
130175 {"cooking",required_argument,NULL,'c'},
131176 {"dictionary",optional_argument,NULL,'d'},
132177 {"font-chars",required_argument,NULL,'f'},
133178 {"help",no_argument,NULL,'h'},
134- {"unicode-list",no_argument,NULL,'U'},
179+ {"generate-index",no_argument,NULL,'G'},
180+ {"ignore-indices",no_argument,NULL,'I'},
181+ {"unicode-list",optional_argument,NULL,'U'},
135182 {"version",no_argument,NULL,'V'},
136183 {0,0,0,0},
137184 };
@@ -158,9 +205,17 @@
158205 register_syntax();
159206
160207 /* loop on command-line options */
161- while ((c=getopt_long(argc,argv,"U::Vc:d::f:h",long_opts,NULL))!=-1) {
208+ while ((c=getopt_long(argc,argv,"GIU::Vc:d::f:h",long_opts,NULL))!=-1) {
162209 switch (c) {
210+
211+ case 'G':
212+ generate_index=1;
213+ break;
163214
215+ case 'I':
216+ ignore_indices=1;
217+ break;
218+
164219 case 'U':
165220 generate_list=1;
166221 unilist_cfg=optarg;
@@ -206,6 +261,8 @@
206261 puts("Usage: " PACKAGE_TARNAME " [OPTION]... PATTERN [FILE]...\n"
207262 "PATTERN should be an Extended Ideographic Description Sequence\n\n"
208263 "Options:\n"
264+ " -G, --generate-index generate bit vector index\n"
265+ " -I, --ignore-indices don't use bit vector indices\n"
209266 " -U, --unicode-list=CFG generate Unicode list\n"
210267 " -V, --version display version and license\n"
211268 " -c, --cooking=FMT set input/output cooking\n"
@@ -247,8 +304,8 @@
247304 if (glob(dictglob,0,NULL,&globres)==0) {
248305 num_files+=globres.gl_pathc;
249306 for (c=0;c<globres.gl_pathc;c++)
250- process_file(match_pattern,globres.gl_pathv[c],
251- num_files>1?strlen(dictdir)+1:-1);
307+ process_file_indexed(match_pattern,globres.gl_pathv[c],
308+ num_files>1?strlen(dictdir)+1:-1);
252309 globfree(&globres);
253310 }
254311 free(dictglob);
@@ -256,7 +313,7 @@
256313
257314 /* loop on explicit filenames */
258315 while (optind<argc)
259- process_file(match_pattern,argv[optind++],num_files>1?0:-1);
316+ process_file_indexed(match_pattern,argv[optind++],num_files>1?0:-1);
260317
261318 /* read stdin or complain */
262319 if ((num_files==0) && (generate_list==0)) {
@@ -265,6 +322,6 @@
265322 else
266323 puts("(no dictionaries were searched)");
267324 }
268-
325+
269326 exit(0);
270327 }
Show on old repository browser