• R/O
  • SSH
  • HTTPS

tsukurimashou: Commit


Commit MetaInfo

Revision455 (tree)
Time2013-08-07 22:50:21
Authormskala

Log Message

tree match memoization, promote library checks to main build

Change Summary

Incremental Difference

--- trunk/m4/ax_perl_module_version.m4 (nonexistent)
+++ trunk/m4/ax_perl_module_version.m4 (revision 455)
@@ -0,0 +1,85 @@
1+# ===========================================================================
2+# http://www.gnu.org/software/autoconf-archive/ax_perl_module_version.html
3+# ===========================================================================
4+#
5+# SYNOPSIS
6+#
7+# AX_PERL_MODULE_VERSION([MODULE VERSION], [ACTION-IF-TRUE], [ACTION-IF-FALSE])
8+#
9+# DESCRIPTION
10+#
11+# Checks to see if the list of 'Module Version' are avaiable in the
12+# system. If all the modules in the list are avaiable ACTION-IF-TRUE is
13+# executed. Case one module is not avaiable ACTION-IF-FALSE is executed
14+# and the macro execution is aborted. NOTE: Perl is needed.
15+#
16+# Example:
17+#
18+# AX_PERL_MODULE_VERSION(CGI::Test 0.104 CGI::Ajax 0.694, ,
19+# AC_MSG_ERROR(Need some Perl modules))
20+#
21+# LICENSE
22+#
23+# Copyright (c) 2009 Marco Gomes <mpglesi@gmail.com>
24+# Copyright (c) 2009 Ruben Fonseca <fonseka@gmail.com>
25+#
26+# Copying and distribution of this file, with or without modification, are
27+# permitted in any medium without royalty provided the copyright notice
28+# and this notice are preserved. This file is offered as-is, without any
29+# warranty.
30+
31+#serial 5
32+
33+AU_ALIAS([AC_PERL_MODULE_VERSION], [AX_PERL_MODULE_VERSION])
34+AC_DEFUN([AX_PERL_MODULE_VERSION],[dnl
35+ac_perl_list_modules="$1"
36+# Make sure we have perl
37+if test -z "$PERL"; then
38+AC_CHECK_PROG(PERL,perl,perl)
39+fi
40+
41+# Check the number of arguments
42+args_num=`echo $ac_perl_list_modules | wc -w`
43+let "ckeck_args = $args_num % 2"
44+if test "$check_args" = "1" ; then
45+ AC_MSG_ERROR(syntax error)
46+else
47+ eval
48+fi
49+
50+if test "x$PERL" != x; then
51+ ac_failed=0
52+ while test ${#ac_perl_list_modules} -gt 2 ; do
53+ module_name=`echo $ac_perl_list_modules | cut -d " " -f 1`
54+ module_version=`echo $ac_perl_list_modules | cut -d " " -f 2`
55+ ac_perl_list_modules=`echo $ac_perl_list_modules | cut -d " " -f 3-`
56+ AC_MSG_CHECKING(for perl module $module_name version $module_version)
57+
58+ $PERL "-M$module_name" -e exit > /dev/null 2>&1
59+ if test $? -ne 0; then
60+ AC_MSG_RESULT(no);
61+ ac_failed=1
62+ ac_perl_list_modules=""
63+ else
64+ version=`$PERL "-M$module_name" -e 'print $'"$module_name::VERSION" 2>&1`
65+ $PERL -e 'exit(shift cmp shift)' "$version" "$module_version"
66+ if test $? -eq 0 -o $? -eq 1 ; then
67+ AC_MSG_RESULT(ok);
68+ else
69+ AC_MSG_RESULT(no)
70+ ac_failed=1
71+ ac_perl_list_modules=""
72+ fi
73+ fi;
74+ done
75+
76+ if test "$ac_failed" = 0; then
77+ :
78+ $2
79+ else
80+ :
81+ $3
82+ fi
83+else
84+ AC_MSG_ERROR(could not find perl)
85+fi])dnl
--- trunk/idsgrep/m4/ax_perl_module_version.m4 (revision 454)
+++ trunk/idsgrep/m4/ax_perl_module_version.m4 (nonexistent)
@@ -1,85 +0,0 @@
1-# ===========================================================================
2-# http://www.gnu.org/software/autoconf-archive/ax_perl_module_version.html
3-# ===========================================================================
4-#
5-# SYNOPSIS
6-#
7-# AX_PERL_MODULE_VERSION([MODULE VERSION], [ACTION-IF-TRUE], [ACTION-IF-FALSE])
8-#
9-# DESCRIPTION
10-#
11-# Checks to see if the list of 'Module Version' are avaiable in the
12-# system. If all the modules in the list are avaiable ACTION-IF-TRUE is
13-# executed. Case one module is not avaiable ACTION-IF-FALSE is executed
14-# and the macro execution is aborted. NOTE: Perl is needed.
15-#
16-# Example:
17-#
18-# AX_PERL_MODULE_VERSION(CGI::Test 0.104 CGI::Ajax 0.694, ,
19-# AC_MSG_ERROR(Need some Perl modules))
20-#
21-# LICENSE
22-#
23-# Copyright (c) 2009 Marco Gomes <mpglesi@gmail.com>
24-# Copyright (c) 2009 Ruben Fonseca <fonseka@gmail.com>
25-#
26-# Copying and distribution of this file, with or without modification, are
27-# permitted in any medium without royalty provided the copyright notice
28-# and this notice are preserved. This file is offered as-is, without any
29-# warranty.
30-
31-#serial 5
32-
33-AU_ALIAS([AC_PERL_MODULE_VERSION], [AX_PERL_MODULE_VERSION])
34-AC_DEFUN([AX_PERL_MODULE_VERSION],[dnl
35-ac_perl_list_modules="$1"
36-# Make sure we have perl
37-if test -z "$PERL"; then
38-AC_CHECK_PROG(PERL,perl,perl)
39-fi
40-
41-# Check the number of arguments
42-args_num=`echo $ac_perl_list_modules | wc -w`
43-let "ckeck_args = $args_num % 2"
44-if test "$check_args" = "1" ; then
45- AC_MSG_ERROR(syntax error)
46-else
47- eval
48-fi
49-
50-if test "x$PERL" != x; then
51- ac_failed=0
52- while test ${#ac_perl_list_modules} -gt 2 ; do
53- module_name=`echo $ac_perl_list_modules | cut -d " " -f 1`
54- module_version=`echo $ac_perl_list_modules | cut -d " " -f 2`
55- ac_perl_list_modules=`echo $ac_perl_list_modules | cut -d " " -f 3-`
56- AC_MSG_CHECKING(for perl module $module_name version $module_version)
57-
58- $PERL "-M$module_name" -e exit > /dev/null 2>&1
59- if test $? -ne 0; then
60- AC_MSG_RESULT(no);
61- ac_failed=1
62- ac_perl_list_modules=""
63- else
64- version=`$PERL "-M$module_name" -e 'print $'"$module_name::VERSION" 2>&1`
65- $PERL -e 'exit(shift cmp shift)' "$version" "$module_version"
66- if test $? -eq 0 -o $? -eq 1 ; then
67- AC_MSG_RESULT(ok);
68- else
69- AC_MSG_RESULT(no)
70- ac_failed=1
71- ac_perl_list_modules=""
72- fi
73- fi;
74- done
75-
76- if test "$ac_failed" = 0; then
77- :
78- $2
79- else
80- :
81- $3
82- fi
83-else
84- AC_MSG_ERROR(could not find perl)
85-fi])dnl
--- trunk/idsgrep/idsgrep.h (revision 454)
+++ trunk/idsgrep/idsgrep.h (revision 455)
@@ -131,7 +131,7 @@
131131
132132 typedef struct _INDEX_RECORD {
133133 uint64_t bits[2];
134- uint32_t offset;
134+ off_t offset;
135135 } INDEX_RECORD;
136136
137137 typedef struct _BIT_FILTER {
@@ -235,6 +235,9 @@
235235 NODE *not_match_fn(NODE *);
236236 NODE *unord_match_fn(NODE *);
237237
238+extern uint64_t memo_checks,memo_hits;
239+
240+void check_memoization(void);
238241 int tree_match(NODE *,NODE *);
239242
240243 /**********************************************************************/
--- trunk/idsgrep/parse.c (revision 454)
+++ trunk/idsgrep/parse.c (revision 455)
@@ -572,9 +572,7 @@
572572
573573 oph->arity=arity;
574574 oph->mate=clh;
575- oph->refs++;
576575 clh->mate=oph;
577- if (clh!=oph) clh->refs++;
578576 hashed_bracket[idx]=oph;
579577 }
580578
--- trunk/idsgrep/idsgrep.tex (revision 454)
+++ trunk/idsgrep/idsgrep.tex (revision 455)
@@ -109,6 +109,8 @@
109109 \renewcommand{\thefootnote}{\fnsymbol{footnote}}
110110
111111 \newcommand{\DangerousBend}{\marginpar{\large\hfill\dbend\hfill\null}}
112+\newcommand{\DangerousSection}{\marginpar{\large\hfill
113+\raisebox{-0.5\baselineskip}[0pt][0pt]{\dbend}\hfill\null}}
112114
113115 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
114116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -815,9 +817,9 @@
815817
816818 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
817819
818-\section{A note on TrueType/OpenType}
820+\section{A note on TrueType/OpenType}\DangerousBend
819821
820-This \DangerousBend version of IDSgrep is designed to read TrueType or
822+This version of IDSgrep is designed to read TrueType or
821823 OpenType files (the distinction between the two is not relevant at this
822824 level) for character map information. The specification for the
823825 TrueType/OpenType file format reads like a parody. I'd like to take a
@@ -1957,17 +1959,20 @@
19571959
19581960 \noindent
19591961
1960-Matching trees according to the rules in the previous chapter is potentially
1961-an expensive operation. The cases of tree-matching actually used in
1962-practice tend to be relatively easy ones, but parsing EIDS syntax is also
1963-expensive enough, even with an optimized implementation, that it may take as
1964-much time as matching or more; and every tree of the database must be parsed
1965-in order to attempt a match. A complete installation of IDSgrep may take a
1966-second or two on a typical PC to parse and search all the dictionaries,
1967-which is not long enough to be a problem in typical interactive use, but
1968-could become an issue if queries were being generated automatically, faster
1969-than a single user would type them.
1962+Executive summary: if you leave it on the default settings and skip this
1963+chapter, it should just work.
19701964
1965+In more detail: matching trees according to the rules in the previous
1966+chapter is potentially an expensive operation. The cases of tree-matching
1967+actually used in practice tend to be relatively easy ones, but parsing EIDS
1968+syntax is also expensive enough, even with an optimized implementation, that
1969+it may take as much time as matching or more; and every tree of the database
1970+must be parsed in order to attempt a match. A complete installation of
1971+IDSgrep may take a second or two on a typical PC to parse and search all the
1972+dictionaries, which is not long enough to be a problem in typical
1973+interactive use, but could become an issue if queries were being generated
1974+automatically, faster than a single user would type them.
1975+
19711976 The solution is to avoid doing tree matches, and avoid doing parsing, as
19721977 much as possible. As of version 0.4, IDSgrep is capable of analysing a
19731978 dictionary in advance and generating an index file representing useful
@@ -1978,13 +1983,7 @@
19781983 index. Then for any entries not excluded by the index, it runs the more
19791984 expensive parsing and tree matching operations on the actual data. The time
19801985 saved by skipping input entries is supposed to more than compensate for the
1981-additional work of reading the index. The size of the effect varies a lot
1982-depending on the data, but in tests with reasonably representative
1983-dictionaries and sample queries, it averaged to about a factor of 15 speed
1984-improvement. If the index file is not sufficiently fresh, was built for a
1985-different computer architecture, or if there is no index file (for instance,
1986-with piped-in input), then IDSgrep will just do a tree match on every tree
1987-in the input, giving correct but slower results.
1986+additional work of reading the index.
19881987
19891988 Exactly how this feature works is complicated, and is part of the author's
19901989 academic research. This chapter starts with a short summary from a user's
@@ -2000,13 +1999,86 @@
20001999
20012000 \section{Building and using bit vector indices}
20022001
2003-FIXME
2002+If you run \texttt{make install} to install dictionaries, then the build
2003+system should build, install, and use bit vector indices for the installed
2004+dictionaries automatically.
20042005
2006+Getting the most out of bit vector indexing requires building the
2007+\texttt{idsgrep} command-line utility with the BuDDy binary decision
2008+diagrams library available at
2009+\url{http://sourceforge.net/projects/buddy/}~\cite{BuDDy}. Without it, bit
2010+vectors will still provide some speed improvement, but not as much.
2011+
2012+Bit vector indices are expected to increase the speed of searching by about
2013+a factor of 15 in typical use. The improvement factor varies a lot
2014+depending on a number of issues, and could be a thousand or more under
2015+optimal conditions. It should never be significantly less than one; that
2016+is, searching with a bit vector index should never take significantly longer
2017+than searching without one.
2018+
2019+Bit vectors provide the greatest benefit when the query is simple
2020+(exact-matching a single syrupy character is best); when the dictionary
2021+entries themselves are small; when the query, regardless of its form, only
2022+matches a small number of results; when the query includes exact matching of
2023+heads or functors at the root level of the EIDS tree (such as a query
2024+starting with ``\texttt{[lr]}'') or in the root's immediate children; and
2025+when the query does not include special matching operators such as regular
2026+expressions and user-defined predicates.
2027+
2028+Whenever the \texttt{texttt} utility reads a file whose pathname ends in
2029+``\texttt{.eids}''---regardless of whether that file was specified
2030+explicitly on the command line or indirectly via the \texttt{-d} option---it
2031+will look for an index file whose pathname is the same except with the
2032+\texttt{.eids} extension changed to \texttt{.bvec}. If such a file exists,
2033+can be read, has the correct 8-byte magic number at the start, \emph{and has
2034+a timestamp no older than the timestamp of the} \texttt{.eids} \emph{file},
2035+then \texttt{idsgrep} will assume it is a bit vector index and use it to
2036+speed up the query process. Note that all those conditions must be met. If
2037+any of the conditions fail to be met, no error will be reported, but the
2038+scanner will be forced to read and parse the entire input file without using
2039+bit vector filtering. Once the \emph{idsgrep} utility commits to start
2040+reading the index file past the header, it cannot switch to index-free
2041+searching and errors after that point will abort the search, just like
2042+errors in the EIDS input file.
2043+
2044+To create a bit vector index, use the \texttt{-G} option to
2045+\texttt{idsgrep}, as in \texttt{idsgrep -G dictionary.eids >
2046+dictionary.bvec}. Something like this will be necessary if the dictionary
2047+file changes or if you create a new one of your own. An outdated index file
2048+will usually be locked out by the timestamp check, but if you force the
2049+issue (for instance, by changing the timestamps with \texttt{touch}), the
2050+search will most likely abort with a parsing error when it hits the changed
2051+part of the dictionary file.
2052+
2053+Bit vector indices are only usable when reading from files with names ending
2054+in \texttt{.eids}. When reading from standard input or similar, or even
2055+just from files without \texttt{*.eids} filenames, bit vectors will not be
2056+used. Bit vectors are also not applicable to the internally-generated
2057+Unicode list associated with the \texttt{-U} option, although a somewhat
2058+similar feature (generating only the at most one entry that could match, if
2059+the query tree has a head) is always used where it applies. If you direct
2060+the \texttt{idsgrep} utility to read from multiple input sources in the same
2061+run, it will use bit vector indices on whichever input sources it can, even
2062+if that is not all of them.
2063+
2064+You can force \texttt{idsgrep} to ignore bit vector indices with the
2065+\texttt{-I} option; that is unlikely to be useful except during speed tests,
2066+but one could maybe imagine a case where it's absolutely necessary to have a
2067+file named \texttt{*.bvec} which is not a bit vector index and must not be
2068+touched.
2069+
2070+Using a (valid) bit vector index, or not using one, should only affect
2071+speed. It should never change which results are or are not returned from a
2072+query. If you manage to find a case where the same query and the same input
2073+file produce different hits depending on whether \texttt{-I} is used or a
2074+bit vector index generated by the same \texttt{idsgrep} binary, then that
2075+may be evidence of a bug; please report it.
2076+
20052077 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
20062078
2007-\section{Filtered matching}
2079+\section{Filtered matching}\DangerousSection
20082080
2009-Let \DangerousBend $\mathbb{E}$ be the set of EIDS trees. Let $\mathsf{T}$
2081+Let $\mathbb{E}$ be the set of EIDS trees. Let $\mathsf{T}$
20102082 and $\mathsf{F}$ represent Boolean true and false. The previous chapter
20112083 defined a function $\mathit{match}:\mathbb{E}\times\mathbb{E}\rightarrow
20122084 \{\mathsf{T},\mathsf{F}\}$ for determining whether one EIDS tree matches
@@ -2066,14 +2138,14 @@
20662138 This idea of doing an easy approximate check first, to save the cost of a
20672139 more difficult accurate evaluation, should be quite familiar. Imagine a
20682140 hiring committee quickly throwing out all the job applications from software
2069-engineers, then interviewing the computational geometers.\footnote{You may
2141+engineers, then interviewing the computational linguists.\footnote{You may
20702142 say I'm a dreamer.} Similar concepts appear throughout computer science:
20712143 consider instruction and data caches; memoization for dynamic programming;
20722144 solving relaxed versions of optimization problems; and especially Bloom
2073-filters~\cite{Bloom:Filters}. The bit vector technique used in IDSgrep
2074-builds on Bloom filters and on the work of Skala and
2075-others~\cite{Skala:Generalized} and Skala and Penn~\cite{Skala:Fast} on type
2076-unification.
2145+filters~\cite{Bloom:Space}. The bit vector technique used in IDSgrep builds
2146+on Bloom filters and on the work of Skala and
2147+others~\cite{Skala:Generalized} and Skala and Penn~\cite{Skala:Approximate}
2148+on unification of types in logic programming systems.
20772149
20782150 Functional programming weenies will note that the real point of an element
20792151 of $\mathbb{F}$ is that you can combine it with $\mathit{check}$ to make a
@@ -2139,9 +2211,9 @@
21392211
21402212 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
21412213
2142-\section{Lambda filters}
2214+\section{Lambda filters}\DangerousSection
21432215
2144-Consider \DangerousBend a very simple needle that matches if and only if the
2216+Consider a very simple needle that matches if and only if the
21452217 head of the haystack has a certain value. For instance, \texttt{<foo>!?}.
21462218 That matches anything with the head ``\texttt{foo}'' under the head-to-head
21472219 matching rule, and nothing else because \texttt{!?} (the inverse of
@@ -2256,9 +2328,9 @@
22562328
22572329 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
22582330
2259-\section{BDD filters}
2331+\section{BDD filters}\DangerousSection
22602332
2261-Lambda \DangerousBend filters do not capture everything we might want to
2333+Lambda filters do not capture everything we might want to
22622334 know about the bits in a vector. Consider the filters (with vector length
22632335 limited to four) $(0011,1)$ and $(1100,1)$. The OR of those filters in the
22642336 lambda filter calculus will be $(1111,1)$. But that will match if
@@ -2386,10 +2458,34 @@
23862458
23872459 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23882460
2389-\section{Implementation details}
2461+\section{Implementation details}\DangerousSection
23902462
2391-FIXME\DangerousBend
2463+Storing bit vectors requires storing two \texttt{uint64\_t}s and an
2464+\texttt{off\_t}, with possible alignment padding, per entry. That works out
2465+to 24 bytes on typical newer 64-bit and 32-bit platforms. Some older 32-bit
2466+platforms with 32-bit \texttt{off\_t}s may only require 20 bytes per index
2467+entry, trading off the decreased space against likely problems should a
2468+dictionary ever grow larger than 4G. For comparison, the average length of
2469+a dictionary entry in the CHISE, KanjiVG, and Tsukurimashou dictionaries as
2470+of this writing is about 40 bytes; four times that for EDICT2. So storing
2471+the indices represents a significant, but not crippling, increase in storage
2472+requirements. The performance improvement numbers do account for the time
2473+taken to read the additional data from disk---it just isn't enough to be a
2474+problem compared to the savings in parsing and tree matching.
23922475
2476+The format of the bit vector index file is specific to the version of
2477+IDSgrep and the computer architecture including the C compiler. If you use
2478+a bit vector file with a mismatched \texttt{idsgrep} binary, it will almost
2479+certainly be detected as invalid and ignored. Building with or without
2480+BuDDy will not make a difference to the file format; the difference is a
2481+different search algorithm applied to the same data.
2482+
2483+It is possible that the magic numbers may not
2484+perfectly track incompatible changes among different development versions
2485+of IDSgrep that you might check out of SVN, but they should definitely work
2486+correctly to differentiate between incompatible versions that have been
2487+formally released.
2488+
23932489 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23942490 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23952491 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- trunk/idsgrep/Makefile.am (revision 454)
+++ trunk/idsgrep/Makefile.am (revision 455)
@@ -300,6 +300,9 @@
300300 m4/ax_file_escapes.m4: ../m4/ax_file_escapes.m4
301301 $(TSU_V_CP) cp $< $@
302302
303+m4/ax_perl_module_version.m4: ../m4/ax_perl_module_version.m4
304+ $(TSU_V_CP) cp $< $@
305+
303306 m4/ax_print_to_file.m4: ../m4/ax_print_to_file.m4
304307 $(TSU_V_CP) cp $< $@
305308
--- trunk/idsgrep/match.c (revision 454)
+++ trunk/idsgrep/match.c (revision 455)
@@ -365,8 +365,62 @@
365365
366366 /**********************************************************************/
367367
368+typedef struct _MEMO_REC {
369+ int generation;
370+ NODE *needle,*haystack;
371+ MATCH_RESULT match_result;
372+} MEMO_REC;
373+
374+static int memo_mask=0,memo_gen=0;
375+static MEMO_REC *memo_table=NULL;
376+uint64_t memo_checks=UINT64_C(0),memo_hits=UINT64_C(0);
377+
378+
379+void check_memoization(void) {
380+ int hard_node_count;
381+ HASHED_STRING *hs;
382+return;
383+ /* References subtracted off:
384+ * 2 for the new_string() calls right here
385+ * 2 because . and * are single characters and therefore immortal
386+ * 2 for . being an opening and closing bracket
387+ * 2 for . and * being "special functors" with custom match functions
388+ * 2 for . and * having ASCII mnemonic aliases
389+ * total 10
390+ *
391+ * Any other references to these strings at this point, must be
392+ * from nodes in the parsed matching pattern.
393+ *
394+ * This number will have to change if the set of counted references
395+ * to these strings created before the call to check_memoization()
396+ * should change in the future.
397+ */
398+
399+ /* count refs to .*. */
400+ hs=new_string(1,"*");
401+ hard_node_count=hs->refs-6;
402+ delete_string(hs);
403+
404+ /* count refs to ... */
405+ hs=new_string(1,".");
406+ hard_node_count+=hs->refs-4;
407+ delete_string(hs);
408+
409+ /* decide whether to use memoization, and how big a table */
410+ if (hard_node_count>=3) {
411+ if (hard_node_count>22)
412+ hard_node_count=22;
413+ if (hard_node_count<10)
414+ hard_node_count=10;
415+ memo_table=(MEMO_REC *)malloc(sizeof(MEMO_REC)*(2<<hard_node_count));
416+ memo_mask=(2<<hard_node_count)-1;
417+ }
418+}
419+
368420 int tree_match(NODE *needle,NODE *haystack) {
369421 NODE *mn,*tmpn,*tmpnn;
422+ MEMO_REC memo_key;
423+ uint32_t hval;
370424
371425 mn=new_node();
372426 mn->nc_needle=needle;
@@ -374,18 +428,47 @@
374428 needle->refs++;
375429 haystack->refs++;
376430
431+ /* clear out the memoization table, first time and when counter wraps */
432+ if (memo_mask!=0) {
433+ memo_gen++;
434+ memo_key.generation=memo_gen;
435+ memo_key.match_result=MR_INITIAL;
436+ if (memo_gen==1)
437+ memset(memo_table,0,sizeof(MEMO_REC)*(memo_mask+1));
438+ }
439+
377440 while (1) {
378441 if (mn->match_result==MR_INITIAL) {
379- if ((mn->nc_needle->head!=NULL) && (mn->nc_haystack->head!=NULL)) {
380- mn->match_result=(mn->nc_needle->head==mn->nc_haystack->head)?
381- MR_TRUE:MR_FALSE;
382- } else if (mn->nc_needle->arity==mn->nc_needle->functor->arity) {
383- mn=mn->nc_needle->functor->match_fn(mn);
384- } else {
385- mn=default_match_fn(mn);
386- }
442+ if ((mn->nc_needle->head!=NULL) && (mn->nc_haystack->head!=NULL))
443+ mn->match_result=(mn->nc_needle->head==mn->nc_haystack->head)?
444+ MR_TRUE:MR_FALSE;
445+ else if (memo_mask && mn->nc_needle->complete) {
446+ memo_checks++;
447+ memo_key.needle=mn->nc_needle;
448+ memo_key.haystack=mn->nc_haystack;
449+ hval=fnv_hash(sizeof(MEMO_REC),(char *)&memo_key,0)&memo_mask;
450+ if ((memo_table[hval].generation==memo_key.generation)
451+ && (memo_table[hval].needle==memo_key.needle)
452+ && (memo_table[hval].haystack==memo_key.haystack)) {
453+ mn->match_result=memo_table[hval].match_result;
454+ memo_hits++;
455+ } else if (mn->nc_needle->arity==mn->nc_needle->functor->arity)
456+ mn=mn->nc_needle->functor->match_fn(mn);
457+ else
458+ mn=default_match_fn(mn);
459+ } else if (mn->nc_needle->arity==mn->nc_needle->functor->arity)
460+ mn=mn->nc_needle->functor->match_fn(mn);
461+ else
462+ mn=default_match_fn(mn);
387463 } else if (mn->match_parent!=NULL) {
388464 tmpn=mn->match_parent;
465+ if (memo_mask && mn->nc_needle->complete) {
466+ memo_key.needle=mn->nc_needle;
467+ memo_key.haystack=mn->nc_haystack;
468+ hval=fnv_hash(sizeof(MEMO_REC),(char *)&memo_key,0)&memo_mask;
469+ memo_table[hval]=memo_key;
470+ memo_table[hval].match_result=mn->match_result;
471+ }
389472 switch (mn->match_result) {
390473 case MR_TRUE:
391474 case MR_AND_MAYBE:
--- trunk/idsgrep/idsgrep.c (revision 454)
+++ trunk/idsgrep/idsgrep.c (revision 455)
@@ -556,6 +556,9 @@
556556 /* count explicit filenames */
557557 num_files=argc-optind;
558558
559+ /* prepare to memoize, if pattern is sufficiently complex */
560+ check_memoization();
561+
559562 /* generate Unicode list if requested */
560563 if (generate_list)
561564 generate_unicode_list(match_pattern,unilist_cfg);
@@ -597,8 +600,15 @@
597600 rub.ru_utime.tv_sec--;
598601 }
599602 printf("STATS %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64
600- " %" PRIu64 " %d.%06d %d ",
601- bv_checks,bv_hits,bdd_hits,tree_checks,tree_hits,
603+ " %" PRIu64 " %" PRIu64 " %" PRIu64 " %d.%06d %d ",
604+ bv_checks,bv_hits,
605+#ifdef HAVE_BUDDY
606+ bdd_hits,
607+#else
608+ 0,
609+#endif
610+ tree_checks,tree_hits,
611+ memo_checks,memo_hits,
602612 rub.ru_utime.tv_sec-rua.ru_utime.tv_sec,
603613 rub.ru_utime.tv_usec-rua.ru_utime.tv_usec,
604614 #ifdef HAVE_BUDDY
--- trunk/kleknev/Makefile.am (revision 454)
+++ trunk/kleknev/Makefile.am (revision 455)
@@ -213,6 +213,9 @@
213213 m4/ax_file_escapes.m4: ../m4/ax_file_escapes.m4
214214 $(TSU_V_CP) cp $< $@
215215
216+m4/ax_perl_module_version.m4: ../m4/ax_perl_module_version.m4
217+ $(TSU_V_CP) cp $< $@
218+
216219 m4/ax_print_to_file.m4: ../m4/ax_print_to_file.m4
217220 $(TSU_V_CP) cp $< $@
218221
--- trunk/configure.ac (revision 454)
+++ trunk/configure.ac (revision 455)
@@ -200,7 +200,7 @@
200200 AC_CONFIG_SRCDIR([hamlog/hamlog])
201201 AC_CONFIG_HEADERS([config.h])
202202 AC_CONFIG_MACRO_DIR([m4])
203-AC_REVISION([$Id: configure.ac 2019 2013-03-09 21:44:02Z mskala $])
203+AC_REVISION([$Id: configure.ac 2388 2013-08-05 22:37:41Z mskala $])
204204 AC_COPYRIGHT([Copyright (C) 2011, 2012, 2013 Matthew Skala])
205205 AC_SUBST([release_date],["March 7, 2013"])
206206 AM_SILENT_RULES
@@ -260,6 +260,15 @@
260260 # Checks for libraries.
261261 #
262262 AC_CHECK_LIB([m], [pow])
263+AX_PERL_MODULE_VERSION([XML::Parser 2.36],[have_perlxml=yes],
264+ [have_perlxml=no
265+ can_full_idsgrep=no])
266+AC_CHECK_LIB([bdd],[bdd_init],[have_buddy=yes],
267+ [have_buddy=no
268+ can_full_idsgrep=no])
269+PKG_CHECK_MODULES([PCRE],[libpcre],[have_pcre=yes],
270+ [have_pcre=no
271+ can_full_idsgrep=no])
263272 #
264273 ############################################################################
265274 #
@@ -876,6 +885,19 @@
876885 AS_ECHO(["fontlint are likely to report errors."])
877886 ])
878887
888+AS_IF([test '!' "x$enable_parasites" = "xnone" && test "$can_full_idsgrep" = "no"],
889+ [AS_ECHO([])
890+ AS_ECHO(["One or more optional libraries for IDSgrep are missing:"])
891+ AS_IF([test '!' "$have_perlxml" = "yes"],
892+ [AS_ECHO([" XML::Parser (>=2.36) http://search.cpan.org/search?module=XML::Parser"])])
893+ AS_IF([test '!' "$have_pcre" = "yes"],
894+ [AS_ECHO([" PCRE http://www.pcre.org"])])
895+ AS_IF([test '!' "$have_buddy" = "yes"],
896+ [AS_ECHO([" BuDDy https://sourceforge.net/projects/buddy/"])])
897+ AS_ECHO(["Without these things, IDSgrep can still be built if enabled, and will"])
898+ AS_ECHO(["still run, but some features may be missing."])
899+ ])
900+
879901 #
880902 # Prognosticate
881903 #
Show on old repository browser