• R/O
  • SSH
  • HTTPS

tsukurimashou: Commit


Commit MetaInfo

Revision300 (tree)
Time2012-07-27 04:05:39
Authormskala

Log Message

regular expression matching

Change Summary

Incremental Difference

--- trunk/idsgrep/m4/tsu_c_anon_union_struct.m4 (revision 299)
+++ trunk/idsgrep/m4/tsu_c_anon_union_struct.m4 (revision 300)
@@ -33,5 +33,5 @@
3333 [tsu_cv_c_anon_union_struct=yes],[tsu_cv_c_anon_union_struct=no])])
3434 AS_IF([test "x$tsu_cv_c_anon_union_struct" = xyes],
3535 [AC_DEFINE([ANON_UNION_STRUCT],[1],
36- [if anonymous unions and structs can be members])])
36+ [Define to 1 if anonymous unions and structs can be members])])
3737 ])
--- trunk/idsgrep/idsgrep.h (revision 299)
+++ trunk/idsgrep/idsgrep.h (revision 300)
@@ -21,6 +21,10 @@
2121
2222 #include "config.h"
2323
24+#ifdef HAVE_PCRE
25+#include <pcre.h>
26+#endif
27+
2428 /**********************************************************************/
2529
2630 typedef struct _NODE *(*MATCH_FN)(struct _NODE *);
@@ -40,6 +44,10 @@
4044 size_t length;
4145 int refs,arity;
4246 MATCH_FN match_fn;
47+#ifdef HAVE_PCRE
48+ pcre *pcre_compiled;
49+ pcre_extra *pcre_studied;
50+#endif
4351 } HASHED_STRING;
4452
4553 typedef struct _NODE {
@@ -98,7 +106,6 @@
98106 NODE *anywhere_match_fn(NODE *);
99107 NODE *equal_match_fn(NODE *);
100108 NODE *not_match_fn(NODE *);
101-NODE *regex_match_fn(NODE *);
102109 NODE *unord_match_fn(NODE *);
103110
104111 int tree_match(NODE *,NODE *);
@@ -114,3 +121,9 @@
114121
115122 size_t parse(size_t,char *);
116123 void register_syntax(void);
124+
125+/**********************************************************************/
126+
127+/* regex.c */
128+
129+NODE *regex_match_fn(NODE *);
--- trunk/idsgrep/regex.c (nonexistent)
+++ trunk/idsgrep/regex.c (revision 300)
@@ -0,0 +1,128 @@
1+/*
2+ * Regular expression match for IDSgrep
3+ * Copyright (C) 2012 Matthew Skala
4+ *
5+ * This program is free software: you can redistribute it and/or modify
6+ * it under the terms of the GNU General Public License as published by
7+ * the Free Software Foundation, version 3.
8+ *
9+ * This program is distributed in the hope that it will be useful,
10+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+ * GNU General Public License for more details.
13+ *
14+ * You should have received a copy of the GNU General Public License
15+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
16+ *
17+ * Matthew Skala
18+ * http://ansuz.sooke.bc.ca/
19+ * mskala@ansuz.sooke.bc.ca
20+ */
21+
22+#include <stdio.h>
23+#include <stdlib.h>
24+#include <string.h>
25+
26+#include "idsgrep.h"
27+
28+#ifdef HAVE_PCRE
29+
30+/**********************************************************************/
31+
32+NODE *regex_match_fn(NODE *ms) {
33+ const char *errptr;
34+ int erroffset,i;
35+ NODE *rval,*tmpn;
36+
37+ if ((ms->nc_needle->child[0]->head!=NULL) &&
38+ (ms->nc_haystack->head!=NULL)) {
39+ if (ms->nc_needle->child[0]->head->pcre_compiled==NULL) {
40+ ms->nc_needle->child[0]->head->pcre_compiled=
41+ pcre_compile(ms->nc_needle->child[0]->head->data,
42+ PCRE_UTF8|PCRE_NO_UTF8_CHECK,
43+ &errptr,&erroffset,NULL);
44+ if (ms->nc_needle->child[0]->head->pcre_compiled==NULL) {
45+ printf("PCRE error: %s at %d.\n",errptr,erroffset);
46+ exit(1);
47+ }
48+ ms->nc_needle->child[0]->head->pcre_studied=
49+ pcre_study(ms->nc_needle->child[0]->head->pcre_compiled,
50+ 0,&errptr);
51+ if (errptr) {
52+ printf("PCRE error: %s.\n",errptr);
53+ exit(1);
54+ }
55+ }
56+ ms->match_result=
57+ (pcre_exec(ms->nc_needle->child[0]->head->pcre_compiled,
58+ ms->nc_needle->child[0]->head->pcre_studied,
59+ ms->nc_haystack->head->data,
60+ ms->nc_haystack->head->length,
61+ 0,PCRE_NO_UTF8_CHECK,NULL,0)>=0)?
62+ MR_TRUE:MR_FALSE;
63+ return ms;
64+ }
65+
66+ if (ms->nc_needle->child[0]->arity!=ms->nc_haystack->arity) {
67+ ms->match_result=MR_FALSE;
68+ return ms;
69+ }
70+
71+ if (ms->nc_needle->child[0]->functor->pcre_compiled==NULL) {
72+ ms->nc_needle->child[0]->functor->pcre_compiled=
73+ pcre_compile(ms->nc_needle->child[0]->functor->data,
74+ PCRE_UTF8|PCRE_NO_UTF8_CHECK,
75+ &errptr,&erroffset,NULL);
76+ if (ms->nc_needle->child[0]->functor->pcre_compiled==NULL) {
77+ printf("PCRE error: %s at %d.\n",errptr,erroffset);
78+ exit(1);
79+ }
80+ ms->nc_needle->child[0]->functor->pcre_studied=
81+ pcre_study(ms->nc_needle->child[0]->functor->pcre_compiled,
82+ 0,&errptr);
83+ if (errptr) {
84+ printf("PCRE error: %s.\n",errptr);
85+ exit(1);
86+ }
87+ if (pcre_exec(ms->nc_needle->child[0]->functor->pcre_compiled,
88+ ms->nc_needle->child[0]->functor->pcre_studied,
89+ ms->nc_haystack->head->data,
90+ ms->nc_haystack->head->length,
91+ 0,PCRE_NO_UTF8_CHECK,NULL,0)<0) {
92+ ms->match_result=MR_FALSE;
93+ return ms;
94+ }
95+ }
96+
97+ if (ms->nc_needle->child[0]->arity==0) {
98+ ms->match_result=MR_TRUE;
99+ return ms;
100+ }
101+
102+ rval=ms;
103+ ms->match_result=MR_AND_MAYBE;
104+
105+ for (i=0;i<ms->nc_needle->child[0]->arity;i++) {
106+ tmpn=new_node();
107+ tmpn->nc_next=rval;
108+ tmpn->nc_next->refs++;
109+ rval=tmpn;
110+ rval->nc_needle=ms->nc_needle->child[0]->child[i];
111+ rval->nc_needle->refs++;
112+ rval->nc_haystack=ms->nc_haystack->child[i];
113+ rval->nc_haystack->refs++;
114+ rval->match_parent=ms;
115+ }
116+ return rval;
117+}
118+
119+/**********************************************************************/
120+
121+#else
122+
123+NODE *regex_match_fn(NODE *ms) {
124+ puts("compiled without regular expression support");
125+ exit(1);
126+}
127+
128+#endif
--- trunk/idsgrep/idsgrep.tex (revision 299)
+++ trunk/idsgrep/idsgrep.tex (revision 300)
@@ -1323,16 +1323,80 @@
13231323
13241324 \subsection{Regular expression matching}
13251325
1326-It is planned that some future version (likely version 0.3) will support
1327-special behaviour for $\textit{match}'(\texttt{./.}x,y)$ to call a regular
1328-expression library and do string matching within heads or functors, but
1329-the detailed semantics of how that will work are not yet decided. The
1330-mnemonic is that slash is Perl's regular-expression match operator; the
1331-motivating application is to further development of IDSgrep's own
1332-dictionary-generating programs, which tend to create long nullary functors
1333-full of debugging information when they encounter constructs they don't
1334-understand in the other-format dictionaries they read.
1326+If $x$ and $y$ both have heads, then $\textit{match}'(\texttt{./.}x,y)$
1327+is true if and only if the head of $x$, considered as a regular
1328+expression, matches the head of $y$. If $x$ and $y$ do not both have
1329+heads, then $\textit{match}'(\texttt{./.}x,y)$ is true if and only if
1330+$x$ and $y$ have the same arity, the functor of $x$ considered as a
1331+regular expression matches the functor of $y$, and
1332+$\textit{match}(x_i,y_i)$ is true for each of their corresponding
1333+children. This operation is basically the same as the default
1334+matching operation, except that regular expression matching is used
1335+instead of strict equality for testing the heads and functors.
1336+Mnemonic: slash means regular expression matching in Perl.
1337+Verbose ASCII name: ``\texttt{.regex.}.''
13351338
1339+Regular expression matching for the purposes of this operator is as
1340+defined by the Perl Compatible Regular Expressions library, in
1341+whichever version was linked with the \texttt{idsgrep} utility.
1342+Strings are passed into PCRE as UTF-8, and are guaranteed (because the
1343+EIDS parser decodes and re-encodes \texttt{idsgrep}'s input for
1344+internal use) to be valid UTF-8 when PCRE sees them regardless of user
1345+input; as such, PCRE is given the option flags that make it read UTF-8
1346+without doing its own validity check. Use of the PCRE
1347+``\texttt{\textbackslash C}'' syntax for matching individual octets
1348+within UTF-8 is strongly not recommended. All other PCRE options are
1349+left to the defaults chosen when PCRE was compiled, even if those are
1350+silly. The character tables are PCRE's ``C locale'' defaults, not
1351+generated at runtime from the current locale. Things like case
1352+sensitivity can be controlled within the pattern using PCRE's syntax
1353+for doing so. In the event that \texttt{idsgrep} was compiled without
1354+the PCRE library (which is not recommended, but is possible), or that
1355+PCRE was compiled without UTF-8 support, then an attempt to evaluate
1356+the slash operator will trigger a fatal error.
1357+
1358+A matching pattern given to PCRE will have already passed through the
1359+EIDS parser, which removes one level of backslash escaping. The
1360+pattern may also have been passed as a command-line argument to
1361+\texttt{idsgrep} by a shell, which may have undone another level of
1362+backslash escaping. Thus, it may be necessary to escape characters as
1363+many as three times in order to match them literally with the slash
1364+operator. Each of these levels may differ from the others in terms of
1365+the escape sequences it supports and their exact meanings. In many
1366+cases it doesn't really matter which level of processing
1367+evaluates the escaping. For instance, ``\texttt{idsgrep
1368+"/(\textbackslash t)"},'' (shell
1369+evaluates ``\texttt{\textbackslash t},'' EIDS and PCRE see a literal tab);
1370+``\texttt{idsgrep "/(\textbackslash\textbackslash
1371+t)"},'' (shell removes one backslash, EIDS evaluates
1372+``\texttt{\textbackslash t},'' PCRE sees a literal tab); and ``\texttt{idsgrep
1373+"/(\textbackslash\textbackslash\textbackslash\textbackslash
1374+t)"},'' (shell removes two backslashes, EIDS removes
1375+one, PCRE evaluates ``\texttt{\textbackslash t}'') will all match the same
1376+things. If it matters, however, then caution is necessary.
1377+
1378+PCRE because of the limitations of its API effectively forbids zero
1379+bytes (U+0000) in its matching patterns, whereas EIDS allows them to
1380+exist within strings in general. The complexities of PCRE pattern
1381+syntax make it impractical for \texttt{idsgrep} to automatically
1382+escape zero bytes before passing the strings to PCRE; there are too
1383+many different cases possible for the context in which a zero byte
1384+might occur. Note that Python, which like EIDS allows strings to
1385+contain zero bytes and so faces the same issue, briefly attempted to
1386+work around this PCRE API limitation by auto-escaping, and they
1387+eventually gave it up as too complicated and confusing. As a result
1388+of PCRE's API, if the string given as a matching pattern contains a
1389+literal zero byte, then the regular expression to be matched will
1390+consist of the prefix of the string up to but not including the first
1391+zero byte; anything after that will be ignored. Zero bytes are,
1392+nonetheless, permitted in the matching subject, and PCRE can search
1393+for them, provided the pattern does not actually contain literal zero
1394+bytes. For instance, the PCRE syntax ``\texttt{\textbackslash000}''
1395+(or just ``\texttt{\textbackslash0}'' if you can be sure the next
1396+character will not be an octal digit) will match a zero byte. As
1397+discussed above, additional escaping might be needed to ensure that
1398+PCRE, and not EIDS nor the shell, interprets the backslash escape.
1399+
13361400 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
13371401 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
13381402 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- trunk/idsgrep/configure.ac (revision 299)
+++ trunk/idsgrep/configure.ac (revision 300)
@@ -196,7 +196,15 @@
196196 # Checks for libraries.
197197 #
198198 AX_PERL_MODULE_VERSION([XML::Parser 2.36],[],
199- [AC_MSG_ERROR([Required Perl modules are missing])])
199+ [AC_MSG_WARN([XML::Parser, needed for KanjiVG dictionary, is missing])])
200+PKG_CHECK_MODULES([PCRE],[libpcre],[
201+ have_pcre=yes
202+ AC_DEFINE([HAVE_PCRE],[1],
203+ [Define to 1 if Perl Compatible Regular Expressions library is available])
204+ ],[
205+ have_pcre=no
206+ AC_MSG_WARN([PCRE is missing, regular expression search won't be built])])
207+AM_CONDITIONAL([COND_PCRE],[test '!' "$have_pcre" = no])
200208 #
201209 ############################################################################
202210 #
--- trunk/idsgrep/hash.c (revision 299)
+++ trunk/idsgrep/hash.c (revision 300)
@@ -53,7 +53,7 @@
5353 HASHED_STRING *rval;
5454 char *save_data;
5555
56- while ((((size_t)1)<<i)<len) i++;
56+ while ((((size_t)1)<<i)<=len) i++;
5757 if (free_strings[i]==NULL) {
5858 rval=(HASHED_STRING *)malloc(sizeof(HASHED_STRING));
5959 memset(rval,0,sizeof(HASHED_STRING));
@@ -68,6 +68,8 @@
6868 rval->length=len;
6969 rval->arity=-2;
7070 rval->match_fn=default_match_fn;
71+ rval->pcre_compiled=NULL;
72+ rval->pcre_studied=NULL;
7173 return rval;
7274 }
7375
@@ -74,8 +76,16 @@
7476 void free_string(HASHED_STRING *s) {
7577 int i=1;
7678
77- while ((((size_t)1)<<i)<s->length) i++;
79+ while ((((size_t)1)<<i)<=s->length) i++;
7880 s->next=free_strings[i];
81+ if (s->pcre_compiled) {
82+ free(s->pcre_compiled);
83+ s->pcre_compiled=NULL;
84+ }
85+ if (s->pcre_studied) {
86+ free(s->pcre_studied);
87+ s->pcre_studied=NULL;
88+ }
7989 free_strings[i]=s;
8090 }
8191
@@ -144,6 +154,7 @@
144154 /* actually add the new entry */
145155 tmps=alloc_string(len);
146156 memcpy(tmps->data,s,len);
157+ tmps->data[len]='\0';
147158 tmps->length=len;
148159 tmps->refs=1;
149160 tmps->next=hash_table[h%hash_table_size];
--- trunk/idsgrep/Makefile.am (revision 299)
+++ trunk/idsgrep/Makefile.am (revision 300)
@@ -66,10 +66,11 @@
6666
6767 EXTRA_DIST = PKGBUILD.in gnugetopt.h idsgrep.tex idsgrep.bib
6868
69-AM_CFLAGS := $(MAYBE_COVERAGE) $(AM_CFLAGS)
70-idsgrep_SOURCES = assoc.c hash.c idsgrep.c idsgrep.h match.c parse.c
69+AM_CFLAGS := $(MAYBE_COVERAGE) $(PCRE_CFLAGS) $(AM_CFLAGS)
70+idsgrep_SOURCES = \
71+ assoc.c hash.c idsgrep.c idsgrep.h match.c parse.c regex.c
7172
72-LDADD = @LIBOBJS@
73+LDADD = @LIBOBJS@ $(PCRE_LIBS)
7374
7475 man1_MANS = idsgrep.1
7576
--- trunk/idsgrep/match.c (revision 299)
+++ trunk/idsgrep/match.c (revision 300)
@@ -158,8 +158,10 @@
158158 NODE *rval,*tmpn;
159159 int i;
160160
161- if ((ms->nc_needle->child[0]->head!=NULL) && (ms->nc_haystack->head!=NULL)) {
162- ms->match_result=(ms->nc_needle->child[0]->head==ms->nc_haystack->head)?
161+ if ((ms->nc_needle->child[0]->head!=NULL) &&
162+ (ms->nc_haystack->head!=NULL)) {
163+ ms->match_result=
164+ (ms->nc_needle->child[0]->head==ms->nc_haystack->head)?
163165 MR_TRUE:MR_FALSE;
164166 return ms;
165167 }
@@ -363,14 +365,6 @@
363365
364366 /**********************************************************************/
365367
366-NODE *regex_match_fn(NODE *ms) { /* SNH */
367- /* FIXME */
368- ms->match_result=MR_TRUE; /* SNH */
369- return ms; /* SNH */
370-}
371-
372-/**********************************************************************/
373-
374368 int tree_match(NODE *needle,NODE *haystack) {
375369 NODE *mn,*tmpn,*tmpnn;
376370
Show on old repository browser