Regex-specific escaping (#39094)
@@ -28,6 +28,9 @@ | ||
28 | 28 | shell is now always implicitly redirected to /dev/null, |
29 | 29 | regardless of whether the standard input has already been |
30 | 30 | redirected. |
31 | + = Quoted characters in the regular expression in the | |
32 | + "[[ word =~ regex ]]" syntax are now handled in (almost) the | |
33 | + same way as in Bash. | |
31 | 34 | * The "command" built-in with the -v or -V option was printing |
32 | 35 | the pathnames of external commands with a redundant leading slash |
33 | 36 | when the current working directory is "/" or "//". |
@@ -45,7 +48,7 @@ | ||
45 | 48 | character. |
46 | 49 | * When there are no positional parameters, the nested expansion |
47 | 50 | "${{@}}" now expands to nothing rather than one empty field. |
48 | - * Unquoted parentheses and vertical bars now can be used in a | |
51 | + * Unquoted parentheses and vertical bars now can be used in the | |
49 | 52 | regular expression in the "[[ word =~ regex ]]" syntax. |
50 | 53 | |
51 | 54 | ---------------------------------------------------------------------- |
@@ -97,6 +97,8 @@ | ||
97 | 97 | __attribute__((nonnull)); |
98 | 98 | static wchar_t *quote_removal_for_regex(const wchar_t *s, const char *cc) |
99 | 99 | __attribute__((nonnull,malloc,warn_unused_result)); |
100 | +static const wchar_t *skip_bracket(const wchar_t *s) | |
101 | + __attribute__((nonnull,pure,warn_unused_result)); | |
100 | 102 | #endif |
101 | 103 | |
102 | 104 |
@@ -871,19 +873,78 @@ | ||
871 | 873 | * string. */ |
872 | 874 | wchar_t *quote_removal_for_regex(const wchar_t *s, const char *cc) |
873 | 875 | { |
874 | - xwcsbuf_T result; | |
875 | - wb_initwithmax(&result, mul(wcslen(s), 2)); | |
876 | + /* First, remove quotations. */ | |
877 | + xwcsbuf_T tmp; | |
878 | + xstrbuf_T tmpcc; | |
879 | + size_t sizehint = wcslen(s); | |
880 | + wb_initwithmax(&tmp, sizehint); | |
881 | + sb_initwithmax(&tmpcc, sizehint); | |
876 | 882 | for (size_t i = 0; s[i] != L'\0'; i++) { |
877 | 883 | if (cc[i] & CC_QUOTATION) |
878 | 884 | continue; |
879 | - if (cc[i] & CC_QUOTED) | |
880 | - if (wcschr(L"^.[$()|*+?{\\", s[i]) != NULL) | |
885 | + wb_wccat(&tmp, s[i]); | |
886 | + sb_ccat(&tmpcc, cc[i]); | |
887 | + } | |
888 | + | |
889 | + /* Next, escape quoted special chars outside brackets */ | |
890 | + xwcsbuf_T result; | |
891 | + wb_initwithmax(&result, sizehint); | |
892 | + for (size_t i = 0; tmp.contents[i] != L'\0'; ) { | |
893 | + if (tmpcc.contents[i] & CC_QUOTED) { | |
894 | + if (wcschr(L"^.[$()|*+?{\\", tmp.contents[i]) != NULL) | |
881 | 895 | wb_wccat(&result, L'\\'); |
882 | - wb_wccat(&result, s[i]); | |
896 | + wb_wccat(&result, tmp.contents[i++]); | |
897 | + } else { | |
898 | + if (tmp.contents[i] != L'[') { | |
899 | + wb_wccat(&result, tmp.contents[i++]); | |
900 | + } else { | |
901 | + const wchar_t *s2 = skip_bracket(&tmp.contents[i]); | |
902 | + size_t j = s2 - tmp.contents; | |
903 | + while (i < j) | |
904 | + wb_wccat(&result, tmp.contents[i++]); | |
905 | + } | |
906 | + } | |
883 | 907 | } |
908 | + | |
909 | + sb_destroy(&tmpcc); | |
910 | + wb_destroy(&tmp); | |
884 | 911 | return wb_towcs(&result); |
885 | 912 | } |
886 | 913 | |
914 | +/* Skips a bracket expression in a regular expression pattern. | |
915 | + * Returns a pointer to the character just after the closing L']', or to the | |
916 | + * terminating null character if the bracket expression is not closed. */ | |
917 | +const wchar_t *skip_bracket(const wchar_t *s) | |
918 | +{ | |
919 | + assert(*s == L'['); | |
920 | + s++; | |
921 | + | |
922 | + if (*s == L'^') | |
923 | + s++; | |
924 | + if (*s == L']') | |
925 | + s++; | |
926 | + | |
927 | + while (*s != L'\0') { | |
928 | + if (*s == L']') | |
929 | + return s + 1; | |
930 | + if (*s++ != L'[') | |
931 | + continue; | |
932 | + | |
933 | + switch (*s) { | |
934 | + case L':': case L'.': case L'=': ; | |
935 | + wchar_t end[] = { *s, L']', L'\0', }; | |
936 | + s++; | |
937 | + const wchar_t *endp = wcsstr(s, end); | |
938 | + if (endp == NULL) | |
939 | + return s + wcslen(s); | |
940 | + s = endp + 2; | |
941 | + break; | |
942 | + } | |
943 | + } | |
944 | + | |
945 | + return s; | |
946 | +} | |
947 | + | |
887 | 948 | #endif /* YASH_ENABLE_DOUBLE_BRACKET */ |
888 | 949 | |
889 | 950 |