• R/O
  • SSH
  • HTTPS

tsukurimashou: Commit


Commit MetaInfo

Revision: 311 (tree)
Time: 2012-08-17 12:21:52
Author: mskala

Log Message

a lot more progress on cooked mode

Change Summary

Incremental Difference

--- trunk/idsgrep/parse.c (revision 310)
+++ trunk/idsgrep/parse.c (revision 311)
@@ -178,6 +178,8 @@
178178 break;
179179 }
180180 offs+=(i+1);
181+ if (i>9)
182+ xval=0xFFFD;
181183 if (eptr[i]!='}')
182184 continue;
183185
@@ -206,6 +208,8 @@
206208 eptr=ebuf;
207209 if (xval>=0x110000)
208210 xval=0xFFFD;
211+ if ((xval>=0xD800) && (xval<=0xDFFF))
212+ xval=0xFFFD;
209213 if (xval<0x80) {
210214 clen=1;
211215 ebuf[0]=xval;
@@ -312,7 +316,7 @@
312316 }
313317
314318 /* check for non-Unicode */
315- if (((eptr[0]==0xF4) && (eptr[2]&0xF0)>0x80) ||
319+ if ((((eptr[0]&0xFF)==0xF4) && (eptr[1]&0xF0)>0x80) ||
316320 ((eptr[0]&0xFF)>0xF4)) {
317321 offs+=4;
318322 continue;
@@ -561,7 +565,6 @@
561565 }
562566
563567 hs->arity=cs->arity;
564- hs->match_fn=cs->match_fn;
565568 hs->canonical=cs;
566569 cs->canonical=hs;
567570 }
--- trunk/idsgrep/idsgrep.tex (revision 310)
+++ trunk/idsgrep/idsgrep.tex (revision 311)
@@ -834,7 +834,9 @@
834834 Then the basic function of \texttt{idsgrep} is to take one EIDS as a
835835 matching pattern, scan a file containing many more, and write out the ones
836836 that match the matching pattern. The three major concepts are described,
837-one each, in the following sections.
837+one each, in the following sections. A final section describes options for
838+how the command-line \texttt{idsgrep} program
839+generates EIDS syntax on output.
838840
839841 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
840842
@@ -1100,7 +1102,9 @@
11001102 importance to the matching algorithm. Note that the single-character
11011103 versions are always the canonical ones, and although the brackets are
11021104 shown explicitly for clarity, they are nearly all characters from the
1103-``sugary implicit'' list.
1105+``sugary implicit'' list. This feature may be disabled or modified using
1106+some settings of the ``\texttt{-c}'' command-line option; see the section
1107+on output cooking for more information.
11041108
11051109 \texttt{\begin{tabular}{cccccc}
11061110 (anything) & $\Rightarrow$ & (?) & .anywhere. & $\Rightarrow$ & ... \\
@@ -1116,6 +1120,62 @@
11161120 \{tcb\} & $\Rightarrow$ & \{⿳\}
11171121 \end{tabular}}
11181122
1123+The \texttt{idsgrep} command-line utility attempts to follow Postel's Law
1124+with respect to byte sequences that are not valid UTF-8: ``be conservative
1125+in what you do, be liberal in what you accept from
1126+others.''~\cite{Postel:TCP} Jesus of Nazareth stated a similar principle
1127+somewhat earlier.\footnote{``There is nothing from without a man, that
1128+entering into him can defile him: but the things which come out of him,
1129+those are they that defile the man.''~(Mark 7:15, KJV)} Accordingly, invalid
1130+UTF-8 on input is not in general treated as a fatal error. Handling of
1131+invalid UTF-8 represents a delicate balance of security issues: if invalid
1132+UTF-8 is treated as completely fatal, that creates the possibility for
1133+denial of service attacks, but if it is permitted to too great an extent, it
1134+can create opportunities for things like buffer overflows. In general, the
1135+\texttt{idsgrep} utility will not itself break when given bad UTF-8, nor
1136+will it make matters worse compared to a system that did not include
1137+\texttt{idsgrep}, but \texttt{idsgrep} cannot be counted on to actively
1138+protect some other piece of software that would otherwise be vulnerable to
1139+bad UTF-8.\footnote{Genesis 4:9.}
1140+
1141+The parser will skip over (as if they did not exist at all) byte sequences
1142+that are not valid UTF-8, including the forbidden bytes 0xC0, 0xC1, and 0xF5
1143+through 0xFF; continuation bytes outside valid multibyte sequences;
1144+``overlong'' sequences (those that would otherwise be valid, but encode a
1145+given code point other than in the shortest possible way); surrogates; and
1146+sequences that encode code points outside the Unicode range. Depending on
1147+where they occur within a multibyte sequence, some of these things may
1148+result in the whole sequence being skipped instead of just the bad bytes,
1149+with the parser making its best guess as to what that means. Be aware that
1150+some other software may treat some of these things as valid.
1151+
1152+When a code point outside the Unicode range, or a surrogate, is specified
1153+using a backslash hexadecimal escape, the parser will interpret it as if the
1154+substitute character U+FFFD had been specified instead. All UTF-8 sequences
1155+\emph{actually generated by} the \texttt{idsgrep} program are guaranteed to
1156+be valid UTF-8, barring serious programming errors; and matching operations
1157+including PCRE matches occur only on the parsed internal representation
1158+which is valid UTF-8. Note that PCRE, despite having a deprecated syntax
1159+for sub-encoding byte matching, \emph{cannot} be used to detect invalid bytes
1160+that the \texttt{idsgrep} parser skipped; it sees only what the parser
1161+validly parsed. However, since in its default mode the \texttt{idsgrep}
1162+program will echo through to the output the exact input byte sequence that
1163+was parsed to create a tree, not the internal representation, it is possible
1164+that non-UTF-8 input could result in non-UTF-8 output. Several cooked
1165+output modes, in which \texttt{idsgrep} generates its own UTF-8 from the
1166+internal representation and provides guarantees of valid UTF-8 or even valid
1167+ASCII output, are available but non-default.
1168+
1169+Some byte sequences that are valid UTF-8 but not valid Unicode, for instance
1170+the sequence that encodes a reversed byte order mark, may possibly go
1171+undetected in the input and be allowed in the output, even when cooked, by
1172+the current version of \texttt{idsgrep}. It is intended that
1173+\texttt{idsgrep} should detect that kind of thing where it is reasonable to
1174+do so, and future versions may do it better than this one does; but some
1175+higher-level errors in Unicode usage, such as misuse of combining characters
1176+or variation selectors, will probably never fall within the scope of
1177+\texttt{idsgrep}.
1178+
11191179 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
11201180
11211181 \section{Matching}
@@ -1416,8 +1476,185 @@
14161476 the shell, interprets the backslash escape.
14171477
14181478 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1479+
1480+\section{Cooked output}
1481+
1482+The default mode of operation for the \texttt{idsgrep} command-line utility
1483+is that whenever a matching tree is detected, the exact sequence of bytes
1484+ that were parsed to generate that tree (excluding any skipped whitespace
1485+ before it, but including all skipped whitespace after it and before the next tree)
1486+will be copied through to the output. This mode of operation is called
1487+``raw.'' Raw mode is easy to understand, efficient, preserves distinctions
1488+like different kinds of brackets in the input, and is as analogous as
1489+reasonably possible to the operation of \texttt{grep}. However, preserving
1490+the exact input bytes may preserve invalid UTF-8, valid but weird EIDS
1491+syntax, or non-ASCII characters users may find difficult to type or
1492+display, that may have existed in the input. The ``\texttt{-c}''
1493+(``\texttt{--cooking}'') command-line option provides a wide range of
1494+ways for \texttt{idsgrep} to generate new EIDS syntax of its own, guaranteed
1495+to be valid, from the internal representation generated by the parser. The
1496+cooked output modes force the output into a well-behaved format
1497+independent of what the input looked like.
1498+Input canonicalization (such as the translation from ``\texttt{[lr]}'' to
1499+``\texttt{⿰}'') can also be controlled through this interface.
1500+
1501+The ``\texttt{-c}'' option can be given a (lowercase ASCII Latin,
1502+unabbreviated) keyword as
1503+its argument, to select a preset output mode. That is the only recommended
1504+way to use this option. The available preset modes are as follows:
1505+\begin{description}
1506+ \item[\texttt{raw}] Raw mode: write out the exact input byte sequence
1507+ that was parsed to generate the matching tree, \emph{even if it is not
1508+ valid UTF-8}. This is the default.
1509+ \item[\texttt{rawnc}] Raw with no canonicalization: raw mode output,
1510+ but without the canonicalization transformation during input parsing.
1511+ \item[\texttt{ascii}] ASCII-only: all non-ASCII characters and ASCII
1512+ control characters are replaced by escape sequences or subjected to
1513+ the reverse of the input canonicalization transformation, to produce a
1514+ result that should pass through most limited-character-set channels.
1515+ Note that the plainest ASCII space (U+0020) is not escaped in this mode
1516+ when EIDS syntax does not require it to be. This mode generally uses a
1517+ lot of hexadecimal escapes and, in a dictionary-lookup context, may be
1518+ useful for finding the hexadecimal code point value of an unknown
1519+ character.
1520+ \item[\texttt{cooked}] Generic cooked mode: render trees as reasonably
1521+ clean and appealing Unicode text similar but not necessarily identical
1522+ to what appears in the pregenerated dictionary files. This will escape
1523+ characters outside the Basic Multilingual Plane; characters in all
1524+ Private Use Areas; and any other characters that EIDS syntax
1525+ \emph{requires} must be escaped; but no others. It will choose an
1526+ appropriate escaping method depending on the type of character.
1527+ Generally, it will use black lenticular brackets for top-level heads,
1528+ ASCII brackets elsewhere, and syntactic sugar and syrup to avoid
1529+ brackets where possible (except for top-level heads).
1530+ \item[\texttt{indent}] Write trees on multiple lines with two-space
1531+ indentation to show their structure as clearly as possible. One blank
1532+ line (two newlines) between trees. In other ways this is similar to
1533+ ``\texttt{cooked}.''
1534+\end{description}
1535+
1536+If not given a preset keyword, ``\texttt{-c}'' can be given a string of
1537+ASCII decimal digits. The decimal-string interface allows precise control
1538+of how output syntax will be generated, but it is somewhat experimental,
1539+very complicated, and may change incompatibly in future versions of this
1540+software. Use of this feature is not recommended. Nonetheless, the
1541+remainder of this section will attempt to document it.
1542+
1543+The format specifier may be up to twelve digits long. If it is shorter than
1544+that, it is taken as a prefix with unspecified digits copied from the
1545+default specifier, which is ``\texttt{100000013250}'' and equivalent to the
1546+``\texttt{cooked}'' preset. The two raw presets are handled as special
1547+cases; of the remaining cooked presets, ``\texttt{ascii}'' is equivalent to
1548+``\texttt{000000013551}'' and ``\texttt{indent}'' is equivalent to
1549+``\texttt{100000223250}.''
1550+
1551+The first digit specifies the type of brackets to be used for the head of
1552+the root of the tree: 0 for ``\texttt{<>},'' 1 for ``\texttt{【】},'' or 2
1553+for ``\texttt{〖〗}.'' The second digit specifies the type of brackets for
1554+the head of any non-root node, using the same code.
1555+
1556+The third digit specifies the type of brackets for nullary functors: 0 for
1557+``\texttt{()},'' 1 for ``\texttt{（）},'' or 2 for ``\texttt{⦅⦆}.''
1558+Similarly, the fourth digit specifies the brackets for unary functors:
1559+0 for ``\texttt{..},'' 1 for ``\texttt{::},'' or 2 for ``\texttt{・・}'';
1560+the fifth digit specifies the brackets for binary functors:
1561+0 for ``\texttt{[]},'' 1 for ``\texttt{［］},'' or 2 for ``\texttt{〚〛}'';
1562+and the sixth digit specifies the brackets for ternary functors:
1563+0 for ``\texttt{\{\}},'' 1 for ``\texttt{〔〕},'' or 2 for
1564+``\texttt{〘〙}''.
1565+
1566+The seventh digit describes how to insert newlines and indentation to
1567+pretty-print the tree structure. If it is 0, that will not be done. If it
1568+is 8, trees will be pretty-printed using one tab character per level; the
1569+number eight is a mnemonic for the fact that people generally expect those
1570+to be equivalent to eight spaces each. Any other decimal digit specifies
1571+that many spaces per level.
1572+
1573+The eighth digit specifies the separator printed between trees: 0 for a null
1574+byte (U+0000), 1 for a newline, 2 for two newlines, or 3 for no separator at
1575+all.
1576+
1577+The ninth digit specifies the circumstances under which the sugary and
1578+syrupy features of EIDS syntax should be used. It is a sum of binary flags:
1579+add 4 to use a syrupy semicolon when possible at the top level;
1580+2 to use a syrupy semicolon when possible at other levels;
1581+and 1 to use sugary implicit brackets wherever possible.
1582+
1583+The tenth digit specifies which characters should be escaped. Literal
1584+backslashes, and (within a bracketed string) literal instances of the
1585+close-bracket character that would otherwise end the string, must
1586+always be escaped.
1587+When the tenth digit is 0, those are the only characters that will be escaped.
1588+Other values add escaping for the following categories of characters, and do
1589+so cumulatively with each digit also escaping everything that would be
1590+escaped by all lesser digits.
1591+\begin{description}
1592+ \item[1] Escape characters from the astral planes; that is,
1593+ characters with code points greater than U+FFFF and thus outside the
1594+ Basic Multilingual Plane.
1595+ \item[2] Escape characters from the BMP Private Use Areas, U+E000 to
1596+ U+F8FF. The other Private Use Areas are already escaped at level 1
1597+ by virtue of being outside the BMP.
1598+ \item[3] Escape all non-ASCII characters (U+0080 and up) except the core
1599+ Unified Han range (U+4E00 to U+9FFF).
1600+ \item[4] Escape the core Unified Han range.
1601+ \item[5] Escape the ASCII control characters (U+0000 to U+001F).
1602+ \item[6] Escape closing brackets at the start of bracketed strings, which
1603+ otherwise escape escaping because of a special case in
1604+ the syntax definition.
1605+ \item[7] Escape all characters. Depending on the value of the next digit,
1606+ however, the ASCII Latin alphabet still might not be escaped.
1607+\end{description}
1608+
1609+The eleventh digit specifies \emph{how} to escape whatever characters were
1610+selected for escaping by the tenth digit. The available values are as
1611+follows.
1612+\begin{description}
1613+ \item[0] Use a single backslash followed by the literal character, only.
1614+ The ASCII Latin alphabet
1615+ cannot be escaped in this way and (under this option, or options 1 or 5,
1616+ which fall through to this case) will not be escaped at all. Since the
1617+ literal characters remain in the text, this option is not
1618+ suitable for sending output through any channel that is not fully
1619+ UTF-8-clean.
1620+ \item[1] Use a backslash-letter sequence for ASCII control characters
1621+ U+0001 to U+001B, and otherwise follow option 0.
1622+ \item[2] Use variable-length hexadecimal ``\texttt{\textbackslash x\{\}}''
1623+ sequences for all characters that are selected to escape. This syntax
1624+ can escape any character.
1625+ \item[3] Use two-digit ``\texttt{\textbackslash x}$HH$'' sequences
1626+ wherever possible (that is, for ASCII and ISO-8859-1 characters),
1627+ four-digit ``\texttt{\textbackslash X}$HHHH$'' sequences for
1628+ other characters on the Basic Multilingual Plane,
1629+ and variable-length hexadecimal sequences otherwise.
1630+ \item[4] Use four-digit ``\texttt{\textbackslash X}$HHHH$'' sequences
1631+ wherever possible (that is, for all characters on the BMP), and
1632+ variable-length hexadecimal sequences otherwise.
1633+ \item[5] Attempt to choose the simplest type of escape
1634+ for each character depending on its value, just like option 3 except
1635+ with backslash-letter escapes where possible (U+0001 to U+001B) and
1636+ backslash-literal escapes for ASCII non-control characters
1637+ (U+0020 to U+007E excluding the Latin alphabet). The ASCII Latin
1638+ alphabet will not be escaped at all under this option.
1639+\end{description}
1640+
1641+The twelfth digit specifies canonicalization processing; that is, the
1642+translations on both input and output between alphabetic functor aliases
1643+like ``\texttt{(anything)}'' and their symbolic equivalents like
1644+``\texttt{(?)}.'' Note that in all cases the symbolic versions are the
1645+matching operators; if you disable input canonicalization and enter a
1646+matching pattern of ``\texttt{(anything)}'' it will be matched as an ordinary
1647+nullary functor containing a string of eight ASCII letters, not as the
1648+match-anything operator which is always named ``\texttt{(?)}.'' The digit
1649+value is a sum of binary flags: add 4 to \emph{disable} the default
1650+transformation of alphabetic aliases to symbolic names on input; plus 2 to
1651+enable a translation from alphabetic aliases to symbolic names on output,
1652+which is generally only meaningful if 4 was selected; plus 1 to enable a
1653+transformation from symbolic names back to alphabetic aliases on output.
1654+
14191655 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
14201656 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1657+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
14211658
14221659 \clearpage
14231660 \addcontentsline{toc}{chapter}{Bibliography}
--- trunk/idsgrep/Makefile.am (revision 310)
+++ trunk/idsgrep/Makefile.am (revision 311)
@@ -119,8 +119,9 @@
119119
120120 GCOV_TESTS = \
121121 test/andor test/anynot test/assoc test/basicmatch test/backslash \
122- test/bighash test/demorgan test/equal test/kvg-grone test/messages \
123- test/regex test/spacing test/tsu-grone test/unord test/utf8
122+ test/bighash test/cooked test/demorgan test/equal test/kvg-grone \
123+ test/messages test/regex test/spacing test/tsu-grone test/unord \
124+ test/utf8
124125
125126 define GCDEP_RECIPE
126127 $1.log: test/rmgcda.log
--- trunk/idsgrep/cook.c (revision 310)
+++ trunk/idsgrep/cook.c (revision 311)
@@ -28,27 +28,31 @@
2828 int cook_output=0;
2929 int canonicalize_input=1;
3030
31-#define OS_TOP_HEAD_BRACKET_TYPE 0
32-#define OS_INNER_HEAD_BRACKET_TYPE 1
33-#define OS_NULLARY_BRACKET_TYPE 2
34-#define OS_UNARY_BRACKET_TYPE 3
35-#define OS_BINARY_BRACKET_TYPE 4
36-#define OS_TERNARY_BRACKET_TYPE 5
37-#define OS_INDENTATION 6
38-#define OS_SEPARATOR 7
39-#define OS_SUGAR 8
40-#define OS_ESCAPE_WHAT 9
41-#define OS_ESCAPE_HOW 10
42-#define OS_CANONICAL 11
31+#define OS_TOP_HEAD_BRACKET_TYPE 0 /* 0 ASCII, 1 B lentic., 2 W lentic. */
32+#define OS_INNER_HEAD_BRACKET_TYPE 1 /* like OS_TOP_HEAD_BRACKET_TYPE */
33+#define OS_NULLARY_BRACKET_TYPE 2 /* 0 paren, 1 wide paren, 2 dbl wide */
34+#define OS_UNARY_BRACKET_TYPE 3 /* 0 period, 1 colon, 2 centre dot */
35+#define OS_BINARY_BRACKET_TYPE 4 /* 0 square bckt, 1 wide, 2 dbl wide */
36+#define OS_TERNARY_BRACKET_TYPE 5 /* 0 curly brace, 1 B tort, 2 W tort */
37+#define OS_INDENTATION 6 /* 8=tab, else # of spaces */
38+#define OS_SEPARATOR 7 /* 0 null, 1 \n, 2 \n\n, 3 nothing */
39+#define OS_SUGAR 8 /* 4 syrup @top +2 not @top +1 sugar */
40+#define OS_ESCAPE_WHAT 9 /* increasing subsets from 0 to 7 */
41+#define OS_ESCAPE_HOW 10 /* 0-5: \ \alpha \x{} \x \X smart */
42+#define OS_CANONICAL 11 /* 4 don't on input +2 a->u +1 u->a */
4343
4444 #define NUM_OUTPUT_SETTINGS 12
4545
46-static char output_recipe[NUM_OUTPUT_SETTINGS]="111111111111";
46+static char output_recipe[NUM_OUTPUT_SETTINGS]="100000013250";
4747
48-#define NUM_PRESET_RECIPES 1
48+#define NUM_PRESET_RECIPES 5
4949
5050 static struct {char *name,*recipe;} preset_recipe[NUM_PRESET_RECIPES]={
51- {"",""},
51+ {"ascii", "000000013551"},
52+ {"cooked","100000013250"},
53+ {"indent","100000223250"},
54+ {"raw", "000000000000"},
55+ {"rawnc", "000000000004"},
5256 };
5357
5458 /**********************************************************************/
@@ -96,125 +100,137 @@
96100
97101 /**********************************************************************/
98102
99-void write_bracketed_string(HASHED_STRING *hs,HASHED_STRING *br) {
100- int i,do_esc,c;
103+void write_maybe_escaped_char(char *cp,HASHED_STRING *br) {
104+ int c,do_esc;
101105
102- fwrite(br->data,1,br->length,stdout);
103- for (i=0;i<hs->length;i+=char_length(hs->data+i)) {
104-
105- if (hs->data[i]=='\\') /* backslash - always escape */
106- do_esc=1;
107- else if ((i+br->mate->length<=hs->length) &&
108- (strncmp(hs->data+i,br->mate->data,br->mate->length)==0))
109- do_esc=((i>0) || (output_recipe[OS_ESCAPE_WHAT]>='6'));
110- else if (((unsigned char)hs->data[i])>=0xF0) /* astral planes */
111- do_esc=output_recipe[OS_ESCAPE_WHAT]>='1';
112- else if ((((unsigned char)hs->data[i])>=0xE3) &&
113- (((unsigned char)hs->data[i])<=0xED)) /* mainline CJK */
114- do_esc=output_recipe[OS_ESCAPE_WHAT]>='4';
115- else if (((unsigned char)hs->data[i])==0xEE) /* low PUA */
116- do_esc=output_recipe[OS_ESCAPE_WHAT]>='2';
117- else if (((unsigned char)hs->data[i])>=0x80) /* non-ASCII */
118- do_esc=output_recipe[OS_ESCAPE_WHAT]>='3';
119- else if (((unsigned char)hs->data[i])<=0x20) /* ASCII controls */
120- do_esc=output_recipe[OS_ESCAPE_WHAT]>='5';
121- else
122- do_esc=output_recipe[OS_ESCAPE_WHAT]>='7'; /* all others */
123-
124- if (do_esc) {
125- switch (char_length(hs->data+i)) {
126- case 1:
127- c=(unsigned char)hs->data[i];
106+ switch (char_length(cp)) {
107+ case 1:
108+ c=(unsigned char)cp[0];
109+ break;
110+ case 2:
111+ c=(((unsigned char)cp[0]&0x1F)<<6)|
112+ ((unsigned char)cp[1]&0x3F);
113+ break;
114+ case 3:
115+ c=(((unsigned char)cp[0]&0x0F)<<12)|
116+ (((unsigned char)cp[1]&0x3F)<<6)|
117+ ((unsigned char)cp[2]&0x3F);
118+ break;
119+ case 4:
120+ c=(((unsigned char)cp[0]&0x07)<<18)|
121+ (((unsigned char)cp[1]&0x3F)<<12)|
122+ (((unsigned char)cp[2]&0x3F)<<6)|
123+ ((unsigned char)cp[3]&0x3F);
124+ break;
125+ default:
126+ puts("internal error, inconsistent char length"); /* SNH */
127+ exit(1); /* SNH */
128+ }
129+
130+ if (c=='\\') /* backslash - always escape */
131+ do_esc=1;
132+ else if ((br!=NULL) && (strncmp(cp,br->data,br->length)==0))
133+ /* must escape closing bracket if we were told about it */
134+ do_esc=1;
135+ else if (c>0xFFFF) /* astral planes */
136+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='1';
137+ else if ((c>=0x4E00) && (c<=0x9FFF)) /* mainline CJK */
138+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='4';
139+ else if ((c>=0xE000) && (c<=0xF8FF)) /* low PUA */
140+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='2';
141+ else if (c>=0x80) /* non-ASCII */
142+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='3';
143+ else if (c<0x20) /* ASCII controls */
144+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='5';
145+ else /* all others */
146+ do_esc=output_recipe[OS_ESCAPE_WHAT]>='7';
147+
148+ if (do_esc) {
149+ switch (output_recipe[OS_ESCAPE_HOW]) {
150+
151+ case '5':
152+ case '3':
153+ if (((output_recipe[OS_ESCAPE_HOW]=='3') || (c>=0x7F)) &&
154+ (c<=0xFF)) {
155+ printf("\\x%02X",c);
128156 break;
129- case 2:
130- c=(((unsigned char)hs->data[i]&0x1F)<<6)|
131- ((unsigned char)hs->data[i+1]&0x3F);
157+ }
158+ /* FALL THROUGH */
159+
160+ case '4':
161+ if (((output_recipe[OS_ESCAPE_HOW]=='4') || (c>0xFF)) &&
162+ (c<=0xFFFF)) {
163+ printf("\\X%04X",c);
132164 break;
133- case 3:
134- c=(((unsigned char)hs->data[i]&0x0F)<<12)|
135- (((unsigned char)hs->data[i+1]&0x3F)<<6)|
136- ((unsigned char)hs->data[i+2]&0x3F);
165+ }
166+ /* FALL THROUGH */
167+
168+ case '2':
169+ if ((output_recipe[OS_ESCAPE_HOW]=='2') || (c>0xFFFF)) {
170+ printf("\\x{%X}",c);
137171 break;
138- case 4:
139- c=(((unsigned char)hs->data[i]&0x07)<<18)|
140- (((unsigned char)hs->data[i+1]&0x3F)<<12)|
141- (((unsigned char)hs->data[i+2]&0x3F)<<6)|
142- ((unsigned char)hs->data[i+3]&0x3F);
143- break;
144- default:
145- puts("internal error, inconsistent char length"); /* SNH */
146- exit(1); /* SNH */
147172 }
173+ /* FALL THROUGH */
148174
149- switch (output_recipe[OS_ESCAPE_HOW]) {
150-
151- case '5':
152- case '4':
153- if (((output_recipe[OS_ESCAPE_HOW]=='4') || (c>0xFF)) &&
154- (c<=0xFFFF)) {
155- printf("\\X%04X",c);
175+ case '1':
176+ if ((c>=1) && (c<=27)) {
177+ fputc('\\',stdout);
178+ switch (c) {
179+ case 7:
180+ fputc('a',stdout);
156181 break;
157- }
158- /* FALL THROUGH */
159-
160- case '3':
161- if (((output_recipe[OS_ESCAPE_HOW]=='3') || (c>0x7F)) &&
162- (c<=0xFF)) {
163- printf("\\x%02X",c);
182+ case 8:
183+ fputc('b',stdout);
164184 break;
165- }
166- /* FALL THROUGH */
167-
168- case '2':
169- if ((output_recipe[OS_ESCAPE_HOW]=='2') || (c>0xFFFF)) {
170- printf("\\x{%X}",c);
185+ case 27:
186+ fputc('e',stdout);
171187 break;
172- }
173- /* FALL THROUGH */
174-
175- case '1':
176- if ((c>=1) && (c<=27)) {
177- fputc('\\',stdout);
178- switch (c) {
179- case 7:
180- fputc('a',stdout);
181- break;
182- case 8:
183- fputc('b',stdout);
184- break;
185- case 27:
186- fputc('e',stdout);
187- break;
188- case 12:
189- fputc('f',stdout);
190- break;
191- case 9:
192- fputc('t',stdout);
193- break;
194- case 10:
195- fputc('n',stdout);
196- break;
197- case 13:
198- fputc('r',stdout);
199- break;
200- default:
201- fputc('c',stdout);
202- fputc(c+'A'-1,stdout);
203- break;
204- }
188+ case 12:
189+ fputc('f',stdout);
205190 break;
191+ case 9:
192+ fputc('t',stdout);
193+ break;
194+ case 10:
195+ fputc('n',stdout);
196+ break;
197+ case 13:
198+ fputc('r',stdout);
199+ break;
200+ default:
201+ fputc('c',stdout);
202+ fputc(c+'A'-1,stdout);
203+ break;
206204 }
207- /* FALL THROUGH */
208-
209- case '0':
210- default:
205+ break;
206+ }
207+ /* FALL THROUGH */
208+
209+ case '0':
210+ default:
211+ if ((output_recipe[OS_ESCAPE_HOW]=='5') &&
212+ ((c<=0x1F) || (c==0x7F))) {
213+ printf("\\x%02X",c);
214+ } else {
211215 if (((c|0x20)<'a') || ((c|0x20)>'z'))
212216 fputc('\\',stdout);
213- fwrite(hs->data+i,1,char_length(hs->data+i),stdout);
214- break;
215- }
216- } else
217- fwrite(hs->data+i,1,char_length(hs->data+i),stdout);
217+ fwrite(cp,1,char_length(cp),stdout);
218+ }
219+ break;
220+ }
221+ } else
222+ fwrite(cp,1,char_length(cp),stdout);
223+}
224+
225+void write_bracketed_string(HASHED_STRING *hs,HASHED_STRING *br) {
226+ int i;
227+
228+ fwrite(br->data,1,br->length,stdout);
229+ for (i=0;i<hs->length;i+=char_length(hs->data+i)) {
230+ if ((i==0) && (output_recipe[OS_ESCAPE_WHAT]<'6'))
231+ write_maybe_escaped_char(hs->data+i,NULL);
232+ else
233+ write_maybe_escaped_char(hs->data+i,br->mate);
218234 }
219235 fwrite(br->mate->data,1,br->mate->length,stdout);
220236 }
@@ -255,7 +271,7 @@
255271 (ms->head->arity==-2) &&
256272 (ms->arity==0) &&
257273 (ms->functor==semicolon)) {
258- fwrite(ms->head->data,ms->head->length,1,stdout);
274+ write_maybe_escaped_char(ms->head->data,NULL);
259275
260276 } else {
261277
@@ -286,7 +302,8 @@
286302 mf=ms->functor;
287303
288304 if ((output_recipe[OS_SUGAR]&1) &&
289- (ms->functor->arity==ms->arity) &&
305+ (mf->arity==ms->arity) &&
306+ (mf->mate==NULL) &&
290307 (char_length(mf->data)==mf->length)) {
291308 fwrite(mf->data,mf->length,1,stdout);
292309
Show on old repository browser