• R/O
  • SSH
  • HTTPS

yash: Commit


Commit MetaInfo

Revision4147 (tree)
Time2020-11-12 00:39:35
Authormagicant

Log Message

Faster wglob (#40749)

Change Summary

Incremental Difference

--- yash/trunk/path.c (revision 4146)
+++ yash/trunk/path.c (revision 4147)
@@ -551,9 +551,8 @@
551551
552552 /********** wglob **********/
553553
554-/* The type of compiled glob patterns */
554+/* Parsed glob pattern component */
555555 struct wglob_pattern {
556- struct wglob_pattern *next;
557556 enum {
558557 WGLOB_LITERAL, WGLOB_MATCH, WGLOB_RECSEARCH,
559558 } type;
@@ -560,7 +559,7 @@
560559 union {
561560 struct {
562561 char *name;
563- const wchar_t *wname;
562+ wchar_t *wname;
564563 } literal;
565564 struct {
566565 xfnmatch_T *pattern;
@@ -571,57 +570,92 @@
571570 } value;
572571 };
573572
574-struct wglob_dirstack {
575- struct wglob_dirstack *prev;
573+/* Data used in search */
574+struct wglob_search {
575+ plist_T pattern;
576+ enum wglobflags_T flags;
577+ xstrbuf_T path;
578+ xwcsbuf_T wpath;
579+ plist_T *results;
580+};
581+/* `pattern' is an array of pointers to struct wglob_pattern objects. Each
582+ * wglob_pattern object is called a "component", which corresponds to one
583+ * segment of the entire pattern that is split at slashes.
584+ * `path' and `wpath' are intermediate pathnames, denoting the currently
585+ * searched directory. They are the multi-byte and wide string versions of the
586+ * same pathname. The multi-byte version is mainly used for calling OS APIs and
587+ * the wide version for producing the final results. */
588+
589+/* Data used in search for one level of directory */
590+struct wglob_stack {
591+ const struct wglob_stack *prev;
576592 struct stat st;
593+ unsigned char active_components[];
577594 };
595+/* `st' is mainly used to detect recursion into the same directory and prevent
596+ * infinite search.
597+ * The length of `active_components' is the same as that of `pattern' in `struct
598+ * wglob_search'. When an item of `active_components' is zero, the component is
599+ * not active. When non-zero, it is active. For a recursive search component,
600+ * the value is the depth of the current recursion. */
578601
579-static struct wglob_pattern *wglob_parse_pattern(
580- wchar_t *pat, enum wglobflags_T flags)
581- __attribute__((malloc,warn_unused_result,nonnull));
582-static struct wglob_pattern *wglob_create_recsearch_pattern(
602+/* The wglob search algorithm used to perform naive search, but it was slow when
603+ * the pattern contained more than one recursive search component */
604+// (e.g. foo/**/bar/**/baz)
605+/* We now implement a state-machine-based algorithm that tries to avoid
606+ * searching the same directory more than once. During search, we keep track of
607+ * "active components" in the pattern, which are the components that correspond
608+ * to the currently searched intermediate path. In a recursive search, a single
609+ * directory may match more than one component, so there may be more than one
610+ * active component at a time. We scan all the active components for each
611+ * intermediate path and produce a set of active components for the next path.
612+ */
613+
614+static plist_T wglob_parse_pattern(
615+ const wchar_t *pattern, enum wglobflags_T flags)
616+ __attribute__((nonnull,warn_unused_result));
617+static struct wglob_pattern *wglob_parse_component(
618+ const wchar_t *pattern, enum wglobflags_T flags, bool has_next)
619+ __attribute__((nonnull,malloc,warn_unused_result));
620+static struct wglob_pattern *wglob_create_recsearch_component(
583621 bool followlink, bool allowperiod)
584622 __attribute__((malloc,warn_unused_result));
585-static struct wglob_pattern *wglob_parse_pattern_part(
586- wchar_t *pat, enum wglobflags_T flags)
587- __attribute__((malloc,warn_unused_result,nonnull));
588-static void wglob_free_pattern(struct wglob_pattern *p);
623+static void wglob_free_pattern(struct wglob_pattern *c);
624+static void wglob_free_pattern_vp(void *c);
625+
626+static inline struct wglob_stack *wglob_stack_new(
627+ const struct wglob_search *s, const struct wglob_stack *prev)
628+ __attribute__((nonnull(1)));
589629 static void wglob_search(
590- const struct wglob_pattern *restrict pattern,
591- enum wglobflags_T flags,
592- xstrbuf_T *restrict path,
593- xwcsbuf_T *restrict wpath,
594- plist_T *restrict list)
630+ struct wglob_search *restrict s, struct wglob_stack *restrict t)
595631 __attribute__((nonnull));
596-static void wglob_search_literal(
597- const struct wglob_pattern *restrict pattern,
598- enum wglobflags_T flags,
599- xstrbuf_T *restrict path,
600- xwcsbuf_T *restrict wpath,
601- plist_T *restrict list)
632+static void wglob_search_literal_each(
633+ struct wglob_search *restrict s, const struct wglob_stack *restrict t)
602634 __attribute__((nonnull));
603-static void wglob_search_match(
604- const struct wglob_pattern *restrict pattern,
605- enum wglobflags_T flags,
606- xstrbuf_T *restrict path,
607- xwcsbuf_T *restrict wpath,
608- plist_T *restrict list)
635+static void wglob_add_result(
636+ struct wglob_search *s, bool only_if_existing, bool markdir)
609637 __attribute__((nonnull));
610-static void wglob_search_recsearch(
611- const struct wglob_pattern *restrict pattern,
612- enum wglobflags_T flags,
613- xstrbuf_T *restrict path,
614- xwcsbuf_T *restrict wpath,
615- plist_T *restrict list,
616- struct wglob_dirstack *dirstack)
617- __attribute__((nonnull(1,3,4,5)));
618-static bool is_reentry(
619- const struct stat *st, const struct wglob_dirstack *dirstack)
620- __attribute__((nonnull(1)));
638+static void wglob_search_literal_uniq(
639+ struct wglob_search *restrict s, struct wglob_stack *restrict t)
640+ __attribute__((nonnull));
641+static bool wglob_scandir(
642+ struct wglob_search *restrict s, const struct wglob_stack *restrict t)
643+ __attribute__((nonnull));
644+static void wglob_scandir_entry(
645+ const char *name, struct wglob_search *restrict s,
646+ const struct wglob_stack *restrict t, struct wglob_stack *restrict t2,
647+ bool only_if_existing)
648+ __attribute__((nonnull));
649+static bool wglob_should_recurse(
650+ const char *restrict name, const char *restrict path,
651+ const struct wglob_pattern *restrict c, struct wglob_stack *restrict t,
652+ size_t count)
653+ __attribute__((nonnull));
654+static bool wglob_is_reentry(const struct wglob_stack *const t, size_t count)
655+ __attribute__((nonnull,pure));
656+
621657 static int wglob_sortcmp(const void *v1, const void *v2)
622658 __attribute__((pure,nonnull));
623-static size_t remove_dups(void **array)
624- __attribute__((pure,nonnull));
625659
626660 /* A wide string version of `glob'.
627661 * Adds all pathnames that matches the specified pattern to the specified list.
@@ -643,101 +677,91 @@
643677 bool wglob(const wchar_t *restrict pattern, enum wglobflags_T flags,
644678 plist_T *restrict list)
645679 {
646- size_t listbase = list->length, patternsize = add(wcslen(pattern), 1);
647- xstrbuf_T path;
648- xwcsbuf_T wpath;
649- struct wglob_pattern *p;
650- wchar_t savepattern[patternsize];
680+ struct wglob_search s;
681+ size_t listbase = list->length;
651682
652- p = wglob_parse_pattern(wcscpy(savepattern, pattern), flags);
653- if (p == NULL)
683+ s.pattern = wglob_parse_pattern(pattern, flags);
684+ if (s.pattern.length == 0) {
685+ pl_destroy(&s.pattern);
654686 return false;
687+ }
655688
656- sb_initwithmax(&path, patternsize);
657- wb_initwithmax(&wpath, patternsize);
658- wglob_search(p, flags, &path, &wpath, list);
659- sb_destroy(&path);
660- wb_destroy(&wpath);
661- wglob_free_pattern(p);
689+ s.flags = flags;
690+ sb_init(&s.path);
691+ wb_init(&s.wpath);
692+ s.results = list;
662693
694+ struct wglob_stack *t = wglob_stack_new(&s, NULL);
695+ t->active_components[0] = 1;
696+
697+ wglob_search(&s, t);
698+
699+ free(t);
700+
701+ sb_destroy(&s.path);
702+ wb_destroy(&s.wpath);
703+ plfree(pl_toary(&s.pattern), wglob_free_pattern_vp);
704+
663705 if (!(flags & WGLB_NOSORT)) {
664706 size_t count = list->length - listbase; /* # of resulting items */
665- if (count > 0) {
666- /* sort the items */
667- qsort(list->contents + listbase, count, sizeof (void *),
668- wglob_sortcmp);
669-
670- /* remove duplicates */
671- list->length -= remove_dups(list->contents + listbase);
672- }
707+ qsort(list->contents + listbase, count, sizeof (void *), wglob_sortcmp);
673708 }
674709 return !is_interrupted();
675710 }
676711
677712 /* Parses the specified pattern.
678- * Pattern `pat' may be modified in this function and must not be changed until
679- * the return value is freed by `wglob_free_pattern'.
713+ * The result is a pointer list of newly malloced `struct wglob_pattern's.
680714 * WGLB_CASEFOLD, WGLB_PERIOD and WGLB_RECDIR in `flags' affect the results. */
681-struct wglob_pattern *wglob_parse_pattern(wchar_t *pat, enum wglobflags_T flags)
715+plist_T wglob_parse_pattern(const wchar_t *pattern, enum wglobflags_T flags)
682716 {
683- struct wglob_pattern *result = NULL, **lastp = &result;
684- struct wglob_pattern *p;
717+ plist_T components;
718+ pl_init(&components);
685719
686720 for (;;) {
687- wchar_t *slash = wcschr(pat, L'/');
688- if (slash != NULL) {
689- slash[0] = L'\0';
690- if (!(flags & WGLB_RECDIR))
691- goto normal;
692- if (wcscmp(pat, L"**") == 0)
693- p = wglob_create_recsearch_pattern(false, false);
694- else if (wcscmp(pat, L"***") == 0)
695- p = wglob_create_recsearch_pattern(true, false);
696- else if (wcscmp(pat, L".**") == 0)
697- p = wglob_create_recsearch_pattern(false, true);
698- else if (wcscmp(pat, L".***") == 0)
699- p = wglob_create_recsearch_pattern(true, true);
700- else
701- goto normal;
702- } else {
703-normal:
704- p = wglob_parse_pattern_part(pat, flags);
721+ const wchar_t *slash = wcschr(pattern, L'/');
722+ size_t componentlength =
723+ (slash != NULL) ? (size_t) (slash - pattern) : wcslen(pattern);
724+ wchar_t component[componentlength + 1];
725+ wcsncpy(component, pattern, componentlength);
726+ component[componentlength] = L'\0';
727+
728+ struct wglob_pattern *c =
729+ wglob_parse_component(component, flags, slash != NULL);
730+ if (c == NULL) {
731+ pl_clear(&components, wglob_free_pattern_vp);
732+ break;
705733 }
706- if (p == NULL)
707- goto fail;
708- *lastp = p;
709- lastp = &p->next;
734+ pl_add(&components, c);
735+
710736 if (slash == NULL)
711- return result;
712- pat = &slash[1];
737+ break;
738+ pattern = &slash[1];
713739 }
714740
715-fail:
716- wglob_free_pattern(result);
717- return NULL;
741+ return components;
718742 }
719743
720-struct wglob_pattern *wglob_create_recsearch_pattern(
721- bool followlink, bool allowperiod)
744+/* Parses one pattern component.
745+ * `has_next' must be true if the component is not the last one. */
746+struct wglob_pattern *wglob_parse_component(
747+ const wchar_t *pattern, enum wglobflags_T flags, bool has_next)
722748 {
723- struct wglob_pattern *result = xmalloc(sizeof *result);
724- result->next = NULL;
725- result->type = WGLOB_RECSEARCH;
726- result->value.recsearch.followlink = followlink;
727- result->value.recsearch.allowperiod = allowperiod;
728- return result;
729-}
749+ assert(!wcschr(pattern, L'/'));
730750
731-/* Parses the specified pattern that contains one pathname component.
732- * `pat' must not contain a slash. */
733-struct wglob_pattern *wglob_parse_pattern_part(
734- wchar_t *pat, enum wglobflags_T flags)
735-{
751+ if (has_next && (flags & WGLB_RECDIR)) {
752+ if (wcscmp(pattern, L"**") == 0)
753+ return wglob_create_recsearch_component(false, false);
754+ if (wcscmp(pattern, L"***") == 0)
755+ return wglob_create_recsearch_component(true, false);
756+ if (wcscmp(pattern, L".**") == 0)
757+ return wglob_create_recsearch_component(false, true);
758+ if (wcscmp(pattern, L".***") == 0)
759+ return wglob_create_recsearch_component(true, true);
760+ }
761+
736762 struct wglob_pattern *result = xmalloc(sizeof *result);
737- result->next = NULL;
738763
739- assert(!wcschr(pat, L'/'));
740- if (is_matching_pattern(pat)) {
764+ if (is_matching_pattern(pattern)) {
741765 xfnmflags_T xflags = XFNM_HEADONLY | XFNM_TAILONLY;
742766 if (flags & WGLB_CASEFOLD)
743767 xflags |= XFNM_CASEFOLD;
@@ -744,15 +768,14 @@
744768 if (!(flags & WGLB_PERIOD))
745769 xflags |= XFNM_PERIOD;
746770 result->type = WGLOB_MATCH;
747- result->value.match.pattern = xfnm_compile(pat, xflags);
771+ result->value.match.pattern = xfnm_compile(pattern, xflags);
748772 if (result->value.match.pattern == NULL)
749773 goto fail;
750774 } else {
751- wchar_t *value = unescape(pat);
752- assert(wcslen(value) <= wcslen(pat));
775+ wchar_t *value = unescape(pattern);
753776 result->type = WGLOB_LITERAL;
754- result->value.literal.wname = wcscpy(pat, value);
755- result->value.literal.name = realloc_wcstombs(value);
777+ result->value.literal.wname = value;
778+ result->value.literal.name = malloc_wcstombs(value);
756779 if (result->value.literal.name == NULL)
757780 goto fail;
758781 }
@@ -759,196 +782,344 @@
759782 return result;
760783
761784 fail:
762- free(result);
785+ wglob_free_pattern(result);
763786 return NULL;
764787 }
765788
766-void wglob_free_pattern(struct wglob_pattern *p)
789+struct wglob_pattern *wglob_create_recsearch_component(
790+ bool followlink, bool allowperiod)
767791 {
768- while (p != NULL) {
769- struct wglob_pattern *next = p->next;
770- switch (p->type) {
771- case WGLOB_LITERAL:
772- free(p->value.literal.name);
773- break;
774- case WGLOB_MATCH:
775- xfnm_free(p->value.match.pattern);
776- break;
777- case WGLOB_RECSEARCH:
778- break;
779- }
780- free(p);
781- p = next;
782- }
792+ struct wglob_pattern *result = xmalloc(sizeof *result);
793+ result->type = WGLOB_RECSEARCH;
794+ result->value.recsearch.followlink = followlink;
795+ result->value.recsearch.allowperiod = allowperiod;
796+ return result;
783797 }
784798
785-/* Searches the directory designated in `path' and add matching pathnames to
786- * `list'.
787- * `path' and `wpath' must contain the same pathname, which must be empty or
788- * end with a slash. The contents of `path' and `wpath' may be changed during
789- * the search, but when the function returns the contents are restored to the
790- * original value.
791- * Only the WGLB_MARK flag in `flags' affects the results. */
792-void wglob_search(
793- const struct wglob_pattern *restrict pattern,
794- enum wglobflags_T flags,
795- xstrbuf_T *restrict path,
796- xwcsbuf_T *restrict wpath,
797- plist_T *restrict list)
799+void wglob_free_pattern(struct wglob_pattern *c)
798800 {
799- assert(path->length == 0 || path->contents[path->length - 1] == '/');
800- assert(wpath->length == 0 || wpath->contents[wpath->length - 1] == L'/');
801- switch (pattern->type) {
801+ if (c == NULL)
802+ return;
803+
804+ switch (c->type) {
802805 case WGLOB_LITERAL:
803- wglob_search_literal(pattern, flags, path, wpath, list);
806+ free(c->value.literal.name);
807+ free(c->value.literal.wname);
804808 break;
805809 case WGLOB_MATCH:
806- wglob_search_match(pattern, flags, path, wpath, list);
810+ xfnm_free(c->value.match.pattern);
807811 break;
808812 case WGLOB_RECSEARCH:
809- wglob_search_recsearch(pattern, flags, path, wpath, list, NULL);
810813 break;
811814 }
815+ free(c);
812816 }
813817
814-/* Searches for the pathname component that does not require pattern matching.*/
815-void wglob_search_literal(
816- const struct wglob_pattern *restrict pattern,
817- enum wglobflags_T flags,
818- xstrbuf_T *restrict path,
819- xwcsbuf_T *restrict wpath,
820- plist_T *restrict list)
818+void wglob_free_pattern_vp(void *c)
821819 {
822- const size_t savepathlen = path->length;
823- const size_t savewpathlen = wpath->length;
820+ wglob_free_pattern(c);
821+}
824822
825- assert(pattern->type == WGLOB_LITERAL);
826- if (pattern->next == NULL) {
827- struct stat st;
828- sb_cat(path, pattern->value.literal.name);
829- if (stat(path->contents, &st) >= 0) {
830- if (pattern->value.literal.wname[0] != L'\0') {
831- wb_cat(wpath, pattern->value.literal.wname);
832- if ((flags & WGLB_MARK) && S_ISDIR(st.st_mode))
833- wb_wccat(wpath, L'/');
834- }
835- pl_add(list, xwcsdup(wpath->contents));
823+/* Allocates a new wglob_stack entry with no active components.
824+ * Note that `st' is uninitialized. */
825+struct wglob_stack *wglob_stack_new(
826+ const struct wglob_search *s, const struct wglob_stack *prev)
827+{
828+ struct wglob_stack *t =
829+ xmallocs(sizeof *t, sizeof *t->active_components, s->pattern.length);
830+ t->prev = prev;
831+ memset(t->active_components, 0, s->pattern.length);
832+ return t;
833+}
834+
835+/* Searches the directory `s->path' and adds matching pathnames to `s->results'.
836+ * `s->path' and `s->wpath' must contain the same pathname, which must be empty
837+ * or end with a slash. The contents of `s->path' and `s->wpath' may be changed
838+ * during the search, but when the function returns the contents are restored to
839+ * the original value.
840+ * Only the WGLB_MARK flag in `s->flags' affects the results. */
841+void wglob_search(
842+ struct wglob_search *restrict s, struct wglob_stack *restrict t)
843+{
844+ assert(s->path.length == 0 ||
845+ s->path.contents[s->path.length - 1] == '/');
846+ assert(s->wpath.length == 0 ||
847+ s->wpath.contents[s->wpath.length - 1] == L'/');
848+
849+ if (is_interrupted())
850+ return;
851+
852+ /* find active WGLOB_RECSEARCH components and activate their next component
853+ * to allow the WGLOB_RECSEARCH component to match nothing. */
854+ for (size_t i = 0; i < s->pattern.length; i++) {
855+ if (!t->active_components[i])
856+ continue;
857+
858+ const struct wglob_pattern *c = s->pattern.contents[i];
859+ if (c->type == WGLOB_RECSEARCH) {
860+ assert(i + 1 < s->pattern.length);
861+ t->active_components[i + 1] = t->active_components[i];
836862 }
837- } else {
838- sb_ccat(sb_cat(path, pattern->value.literal.name), '/');
839- wb_wccat(wb_cat(wpath, pattern->value.literal.wname), L'/');
840- wglob_search(pattern->next, flags, path, wpath, list);
841863 }
842864
843- sb_truncate(path, savepathlen);
844- wb_truncate(wpath, savewpathlen);
865+ /* count active components */
866+ size_t active_literals = 0, active_matchers = 0;
867+ for (size_t i = 0; i < s->pattern.length; i++) {
868+ if (!t->active_components[i])
869+ continue;
870+
871+ const struct wglob_pattern *c = s->pattern.contents[i];
872+ if (c->type == WGLOB_LITERAL)
873+ active_literals++;
874+ else
875+ active_matchers++;
876+ }
877+
878+ /* perform search depending on the type of active components */
879+ if (active_matchers > 0)
880+ if (wglob_scandir(s, t))
881+ return;
882+ if (active_literals == 1)
883+ wglob_search_literal_each(s, t);
884+ else if (active_literals > 0)
885+ wglob_search_literal_uniq(s, t);
845886 }
846887
847-/* Searches the directory for files that match with the specified pattern. */
848-void wglob_search_match(
849- const struct wglob_pattern *restrict pattern,
850- enum wglobflags_T flags,
851- xstrbuf_T *restrict path,
852- xwcsbuf_T *restrict wpath,
853- plist_T *restrict list)
888+/* Applies active components to the current directory path and continues
889+ * searching subdirectories.
890+ * This function ignores active components that are not literal.
891+ * This function must be used only when there is only one literal active
892+ * component. If more than one component are active, duplicate results may be
893+ * produced for the same pathname. */
894+void wglob_search_literal_each(
895+ struct wglob_search *restrict s, const struct wglob_stack *restrict t)
854896 {
855- assert(pattern->type == WGLOB_MATCH);
897+ size_t savepathlen = s->path.length, savewpathlen = s->wpath.length;
856898
857- if (is_interrupted())
858- return;
899+ for (size_t i = 0; i < s->pattern.length; i++) {
900+ if (!t->active_components[i])
901+ continue;
859902
860- DIR *dir = opendir((path->length == 0) ? "." : path->contents);
861- if (dir == NULL)
862- return;
903+ const struct wglob_pattern *c = s->pattern.contents[i];
904+ if (c->type != WGLOB_LITERAL)
905+ continue;
863906
864- const size_t savepathlen = path->length;
865- const size_t savewpathlen = wpath->length;
866- struct dirent *de;
867- while ((de = readdir(dir)) != NULL) {
868- if (xfnm_match(pattern->value.match.pattern, de->d_name) == 0) {
869- if (wb_mbscat(wpath, de->d_name) != NULL)
870- goto next;
871- sb_cat(path, de->d_name);
872- if (pattern->next == NULL) {
873- if (flags & WGLB_MARK) {
874- if (is_directory(path->contents))
875- wb_wccat(wpath, L'/');
876- }
877- pl_add(list, xwcsdup(wpath->contents));
878- } else {
879- sb_ccat(path, '/');
880- wb_wccat(wpath, L'/');
881- wglob_search(pattern->next, flags, path, wpath, list);
882- }
883-next:
884- sb_truncate(path, savepathlen);
885- wb_truncate(wpath, savewpathlen);
907+ sb_cat(&s->path, c->value.literal.name);
908+ wb_cat(&s->wpath, c->value.literal.wname);
909+
910+ if (i + 1 < s->pattern.length) {
911+ /* There is a next component. */
912+ struct wglob_stack *t2 = wglob_stack_new(s, t);
913+ t2->active_components[i + 1] = 1;
914+
915+ sb_ccat(&s->path, '/');
916+ wb_wccat(&s->wpath, L'/');
917+
918+ wglob_search(s, t2);
919+
920+ free(t2);
921+ } else {
922+ /* This is the last component. */
923+ wglob_add_result(s, true, false);
886924 }
925+
926+ sb_truncate(&s->path, savepathlen);
927+ wb_truncate(&s->wpath, savewpathlen);
887928 }
888- closedir(dir);
889929 }
890930
891-/* Searches the directory recursively for files that match with the specified
892- * pattern. */
893-void wglob_search_recsearch(
894- const struct wglob_pattern *restrict pattern,
895- enum wglobflags_T flags,
896- xstrbuf_T *restrict path,
897- xwcsbuf_T *restrict wpath,
898- plist_T *restrict list,
899- struct wglob_dirstack *dirstack)
931+/* Adds `s->path' to `s->results'. */
932+void wglob_add_result(
933+ struct wglob_search *s, bool only_if_existing, bool markdir)
900934 {
901- assert(pattern->type == WGLOB_RECSEARCH);
902- assert(pattern->next != NULL);
935+ if (!only_if_existing && !markdir) {
936+ pl_add(s->results, xwcsdup(s->wpath.contents));
937+ return;
938+ }
903939
904- if (is_interrupted())
940+ struct stat st;
941+ bool existing = stat(s->path.contents, &st) >= 0;
942+ if (only_if_existing && !existing)
905943 return;
944+ if (!markdir || !existing || !S_ISDIR(st.st_mode)) {
945+ pl_add(s->results, xwcsdup(s->wpath.contents));
946+ return;
947+ }
906948
907- /* Step 1: search `path' itself */
908- wglob_search(pattern->next, flags, path, wpath, list);
949+ size_t length = add(wcslen(s->wpath.contents), 1);
950+ xwcsbuf_T result;
951+ wb_initwithmax(&result, length);
952+ wb_cat(&result, s->wpath.contents);
953+ wb_wccat(&result, L'/');
954+ pl_add(s->results, wb_towcs(&result));
955+}
909956
910- /* Step 2: search the subdirectories of `path' recursively */
911- DIR *dir = opendir((path->length == 0) ? "." : path->contents);
957+/* Applies active components to the current directory path and continues
958+ * searching subdirectories.
959+ * This function ignores active components that are not literal. */
960+void wglob_search_literal_uniq(
961+ struct wglob_search *restrict s, struct wglob_stack *restrict t)
962+{
963+ /* collect names of active literal components */
964+ hashtable_T ac;
965+ ht_initwithcapacity(&ac, hashwcs, htwcscmp, s->pattern.length);
966+ for (size_t i = 0; i < s->pattern.length; i++) {
967+ if (!t->active_components[i])
968+ continue;
969+
970+ const struct wglob_pattern *c = s->pattern.contents[i];
971+ if (c->type != WGLOB_LITERAL)
972+ continue;
973+
974+ ht_set(&ac, c->value.literal.wname, c);
975+ }
976+
977+ kvpair_T *names = ht_tokvarray(&ac);
978+ ht_destroy(&ac);
979+
980+ /* descend down for each name */
981+ struct wglob_stack *t2 = wglob_stack_new(s, t);
982+ for (const kvpair_T *n = names; n->key != NULL; n++) {
983+ const struct wglob_pattern *c = n->value;
984+ memset(t2->active_components, 0, s->pattern.length);
985+ wglob_scandir_entry(c->value.literal.name, s, t, t2, true);
986+ }
987+
988+ free(t2);
989+ free(names);
990+}
991+
992+/* Applies active components to the current directory path and continues
993+ * searching subdirectories.
994+ * Returns true if the directory could be searched. */
995+bool wglob_scandir(
996+ struct wglob_search *restrict s, const struct wglob_stack *restrict t)
997+{
998+ DIR* dir = opendir((s->path.length == 0) ? "." : s->path.contents);
912999 if (dir == NULL)
913- return;
1000+ return false;
9141001
915- const size_t savepathlen = path->length;
916- const size_t savewpathlen = wpath->length;
1002+ struct wglob_stack *t2 = wglob_stack_new(s, t);
1003+
1004+ /* An empty name, which is needed for empty literal components, must be
1005+ * explicitly produced as it would never be returned from readdir. */
1006+ wglob_scandir_entry("", s, t, t2, true);
1007+
1008+ /* now try each directory entry */
9171009 struct dirent *de;
918- int (*statfunc)(const char *path, struct stat *st) =
919- pattern->value.recsearch.followlink ? stat : lstat;
9201010 while ((de = readdir(dir)) != NULL) {
921- if (pattern->value.recsearch.allowperiod
922- ? strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0
923- : de->d_name[0] == '.')
1011+ memset(t2->active_components, 0, s->pattern.length);
1012+ wglob_scandir_entry(de->d_name, s, t, t2, false);
1013+ }
1014+ closedir(dir);
1015+
1016+ free(t2);
1017+ return true;
1018+}
1019+
1020+/* Checks if each active component matches the given `name' in the current
1021+ * directory path and continues searching subdirectories.
1022+ * `t' is the stack frame for the current directory path and `t2' for the next
1023+ * frame. `t2->prev' must be `t' and `t2->active_components' must have been
1024+ * zeroed.
1025+ * `only_if_existing' is passed to `wglob_add_result' and should be false iff
1026+ * the `name' is known to be an existing file. */
1027+void wglob_scandir_entry(
1028+ const char *name, struct wglob_search *restrict s,
1029+ const struct wglob_stack *restrict t, struct wglob_stack *restrict t2,
1030+ bool only_if_existing)
1031+{
1032+ size_t savepathlen = s->path.length, savewpathlen = s->wpath.length;
1033+
1034+ sb_cat(&s->path, name);
1035+ if (wb_mbscat(&s->wpath, name) != NULL)
1036+ goto done; // skip on error
1037+
1038+ /* add new active components to `t2' */
1039+ for (size_t i = 0; i < s->pattern.length; i++) {
1040+ if (!t->active_components[i])
9241041 continue;
9251042
926- struct wglob_dirstack newstack;
927- sb_cat(path, de->d_name);
928- if (statfunc(path->contents, &newstack.st) >= 0
929- && S_ISDIR(newstack.st.st_mode)
930- && !is_reentry(&newstack.st, dirstack)) {
931-
932- if (wb_mbscat(wpath, de->d_name) != NULL)
933- goto next;
934- sb_ccat(path, '/');
935- wb_wccat(wpath, L'/');
936- newstack.prev = dirstack;
937- wglob_search_recsearch(
938- pattern, flags, path, wpath, list, &newstack);
1043+ const struct wglob_pattern *c = s->pattern.contents[i];
1044+ switch (c->type) {
1045+ case WGLOB_LITERAL:
1046+ if (strcmp(c->value.literal.name, name) != 0)
1047+ continue;
1048+ if (i + 1 < s->pattern.length) // has a next component?
1049+ t2->active_components[i + 1] = 1;
1050+ else
1051+ wglob_add_result(s, only_if_existing, false);
1052+ break;
1053+ case WGLOB_MATCH:
1054+ if (name[0] == '\0')
1055+ continue;
1056+ if (xfnm_match(c->value.match.pattern, name) != 0)
1057+ continue;
1058+ if (i + 1 < s->pattern.length) // has a next component?
1059+ t2->active_components[i + 1] = 1;
1060+ else
1061+ wglob_add_result(s, only_if_existing, s->flags & WGLB_MARK);
1062+ break;
1063+ case WGLOB_RECSEARCH:
1064+ assert(i + 1 < s->pattern.length);
1065+ if (name[0] == '\0')
1066+ continue;
1067+ if (t2->active_components[i] == 0) {
1068+ const char *path = s->path.contents;
1069+ size_t count = t->active_components[i] - 1;
1070+ if (wglob_should_recurse(name, path, c, t2, count))
1071+ t2->active_components[i] = t->active_components[i] + 1;
1072+ }
1073+ break;
9391074 }
940-next:
941- sb_truncate(path, savepathlen);
942- wb_truncate(wpath, savewpathlen);
9431075 }
944- closedir(dir);
1076+
1077+ sb_ccat(&s->path, '/');
1078+ wb_wccat(&s->wpath, L'/');
1079+
1080+ /* descend down to the next subdirectory */
1081+ wglob_search(s, t2);
1082+
1083+done:
1084+ sb_truncate(&s->path, savepathlen);
1085+ wb_truncate(&s->wpath, savewpathlen);
9451086 }
9461087
947-/* Returns true iff the file designated by `st' is contained in `dirstack'. */
948-bool is_reentry(const struct stat *st, const struct wglob_dirstack *dirstack)
1088+/* Decides if we should continue recursion on this component.
1089+ * In this function, `t->st' is updated to the result of `stat'ing the `path'.
1090+ */
1091+bool wglob_should_recurse(
1092+ const char *restrict name, const char *restrict path,
1093+ const struct wglob_pattern *restrict c, struct wglob_stack *restrict t,
1094+ size_t count)
9491095 {
950- for (; dirstack != NULL; dirstack = dirstack->prev)
951- if (stat_result_same_file(st, &dirstack->st))
1096+ if (c->value.recsearch.allowperiod) {
1097+ if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
1098+ return false;
1099+ } else {
1100+ if (name[0] == '.')
1101+ return false;
1102+ }
1103+
1104+ int (*statfunc)(const char *path, struct stat *st) =
1105+ c->value.recsearch.followlink ? stat : lstat;
1106+ if (statfunc(path, &t->st) < 0)
1107+ return false;
1108+ if (!S_ISDIR(t->st.st_mode))
1109+ return false;
1110+ if (wglob_is_reentry(t, count))
1111+ return false;
1112+ return true;
1113+}
1114+
1115+/* Returns true iff a file that is the same as `t->st' appears in `count'
1116+ * ancestors of `t'. */
1117+bool wglob_is_reentry(const struct wglob_stack *const t, size_t count)
1118+{
1119+ for (const struct wglob_stack *t2 = t->prev;
1120+ t2 != NULL && count > 0;
1121+ t2 = t2->prev, count--)
1122+ if (stat_result_same_file(&t->st, &t2->st))
9521123 return true;
9531124 return false;
9541125 }
@@ -958,26 +1129,7 @@
9581129 return wcscoll(*(const wchar_t *const *) v1, *(const wchar_t *const *) v2);
9591130 }
9601131
961-/* Given a NULL-terminated array of pointers to wide strings, removes its
962- * elements so that no adjacent elements compare equal. The removed elements are
963- * freed in this function. Returns the number of removed elements. */
964-size_t remove_dups(void **array)
965-{
966- if (array[0] == NULL)
967- return 0;
9681132
969- size_t in = 1, out = 0;
970- for (; array[in] != NULL; in++) {
971- if (wcscmp(array[in], array[out]) == 0)
972- free(array[in]);
973- else
974- array[++out] = array[in];
975- }
976- array[++out] = NULL;
977- return in - out;
978-}
979-
980-
9811133 /********** Built-ins **********/
9821134
9831135 static wchar_t *canonicalize_path(const wchar_t *path)
Show on old repository browser