From 170260ae90cb6a0fec476e7d970e3ac3e81e98f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= <pclouds@gmail.com> Date: Sun, 18 Nov 2012 16:13:06 +0700 Subject: [PATCH 1/4] pathspec: save the non-wildcard length part MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We mark pathspec with wildcards with the field use_wildcard. We could do better by saving the length of the non-wildcard part, which can be used for optimizations such as f9f6e2c (exclude: do strcmp as much as possible before fnmatch - 2012-06-07). Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/ls-files.c | 2 +- builtin/ls-tree.c | 2 +- cache.h | 2 +- dir.c | 6 +++--- tree-walk.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/builtin/ls-files.c b/builtin/ls-files.c index b5434af0c87..4a9ee690c7d 100644 --- a/builtin/ls-files.c +++ b/builtin/ls-files.c @@ -337,7 +337,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix) matchbuf[0] = prefix; matchbuf[1] = NULL; init_pathspec(&pathspec, matchbuf); - pathspec.items[0].use_wildcard = 0; + pathspec.items[0].nowildcard_len = pathspec.items[0].len; } else init_pathspec(&pathspec, NULL); if (read_tree(tree, 1, &pathspec)) diff --git a/builtin/ls-tree.c b/builtin/ls-tree.c index 235c17cc015..fb76e38d849 100644 --- a/builtin/ls-tree.c +++ b/builtin/ls-tree.c @@ -168,7 +168,7 @@ int cmd_ls_tree(int argc, const char **argv, const char *prefix) init_pathspec(&pathspec, get_pathspec(prefix, argv + 1)); for (i = 0; i < pathspec.nr; i++) - pathspec.items[i].use_wildcard = 0; + pathspec.items[i].nowildcard_len = pathspec.items[i].len; pathspec.has_wildcard = 0; tree = parse_tree_indirect(sha1); if (!tree) diff --git a/cache.h b/cache.h index dbd8018b582..bf031f133c9 100644 --- a/cache.h +++ b/cache.h @@ -482,7 +482,7 @@ struct pathspec { struct pathspec_item { const char *match; int len; - unsigned int use_wildcard:1; + int nowildcard_len; } *items; }; diff --git a/dir.c b/dir.c index 5a83aa7897f..c391d46b2fc 100644 --- a/dir.c +++ b/dir.c @@ -230,7 +230,7 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix, return MATCHED_RECURSIVELY; } - if (item->use_wildcard && !fnmatch(match, name, 0)) + if (item->nowildcard_len < item->len && !fnmatch(match, name, 0)) return MATCHED_FNMATCH; return 0; @@ -1429,8 +1429,8 @@ int init_pathspec(struct pathspec *pathspec, const char **paths) item->match = path; item->len = strlen(path); - item->use_wildcard = !no_wildcard(path); - if (item->use_wildcard) + item->nowildcard_len = simple_length(path); + if (item->nowildcard_len < item->len) pathspec->has_wildcard = 1; } diff --git a/tree-walk.c b/tree-walk.c index 3f54c02d762..af871c50d1a 100644 --- a/tree-walk.c +++ b/tree-walk.c @@ -626,7 +626,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry, &never_interesting)) return entry_interesting; - if (item->use_wildcard) { + if (item->nowildcard_len < item->len) { if (!fnmatch(match + baselen, entry->path, 0)) return entry_interesting; @@ -642,7 +642,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry, } match_wildcards: - if (!item->use_wildcard) + if (item->nowildcard_len == item->len) continue; /* From 5d74762d87859d08fdf4c87ec9387e31e097acba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= <pclouds@gmail.com> Date: Sat, 24 Nov 2012 11:33:49 +0700 Subject: [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- dir.c | 18 +++++++++++++++++- dir.h | 8 ++++++++ tree-walk.c | 6 ++++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/dir.c b/dir.c index c391d46b2fc..f81e1d29175 100644 --- a/dir.c +++ b/dir.c @@ -34,6 +34,21 @@ int fnmatch_icase(const char *pattern, const char *string, int flags) return fnmatch(pattern, string, flags | (ignore_case ? FNM_CASEFOLD : 0)); } +inline int git_fnmatch(const char *pattern, const char *string, + int flags, int prefix) +{ + int fnm_flags = 0; + if (flags & GFNM_PATHNAME) + fnm_flags |= FNM_PATHNAME; + if (prefix > 0) { + if (strncmp(pattern, string, prefix)) + return FNM_NOMATCH; + pattern += prefix; + string += prefix; + } + return fnmatch(pattern, string, fnm_flags); +} + static size_t common_prefix_len(const char **pathspec) { const char *n, *first; @@ -230,7 +245,8 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix, return MATCHED_RECURSIVELY; } - if (item->nowildcard_len < item->len && !fnmatch(match, name, 0)) + if (item->nowildcard_len < item->len && + !git_fnmatch(match, name, 0, item->nowildcard_len - prefix)) return MATCHED_FNMATCH; return 0; diff --git a/dir.h b/dir.h index f5c89e3b801..0e8ae84628c 100644 --- a/dir.h +++ b/dir.h @@ -139,4 +139,12 @@ extern int strcmp_icase(const char *a, const char *b); extern int strncmp_icase(const char *a, const char *b, size_t count); extern int fnmatch_icase(const char *pattern, const char *string, int flags); +/* + * The prefix part of pattern must not contains wildcards. + */ +#define GFNM_PATHNAME 1 /* similar to FNM_PATHNAME */ + +extern int git_fnmatch(const char *pattern, const char *string, + int flags, int prefix); + #endif diff --git a/tree-walk.c b/tree-walk.c index af871c50d1a..2fcf3c02da2 100644 --- a/tree-walk.c +++ b/tree-walk.c @@ -627,7 +627,8 @@ enum interesting tree_entry_interesting(const struct name_entry *entry, return entry_interesting; if (item->nowildcard_len < item->len) { - if (!fnmatch(match + baselen, entry->path, 0)) + if (!git_fnmatch(match + baselen, entry->path, + 0, item->nowildcard_len - baselen)) return entry_interesting; /* @@ -652,7 +653,8 @@ match_wildcards: strbuf_add(base, entry->path, pathlen); - if (!fnmatch(match, base->buf + base_offset, 0)) { + if (!git_fnmatch(match, base->buf + base_offset, + 0, item->nowildcard_len)) { strbuf_setlen(base, base_offset + baselen); return entry_interesting; } From 8c6abbcd2720dd80b1c9b6db8104c4dcc7cba6c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= <pclouds@gmail.com> Date: Sat, 24 Nov 2012 11:33:50 +0700 Subject: [PATCH 3/4] pathspec: apply "*.c" optimization from exclude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a pattern contains only a single asterisk as wildcard, e.g. "foo*bar", after literally comparing the leading part "foo" with the string, we can compare the tail of the string and make sure it matches "bar", instead of running fnmatch() on "*bar" against the remainder of the string. -O2 build on linux-2.6, without the patch: $ time git rev-list --quiet HEAD -- '*.c' real 0m40.770s user 0m40.290s sys 0m0.256s With the patch $ time ~/w/git/git rev-list --quiet HEAD -- '*.c' real 0m34.288s user 0m33.997s sys 0m0.205s The above command is not supposed to be widely popular. It's chosen because it exercises pathspec matching a lot. The point is it cuts down matching time for popular patterns like *.c, which could be used as pathspec in other places. Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- cache.h | 3 +++ dir.c | 18 ++++++++++++++++-- dir.h | 1 + tree-walk.c | 6 ++++-- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cache.h b/cache.h index bf031f133c9..babf9e54c90 100644 --- a/cache.h +++ b/cache.h @@ -473,6 +473,8 @@ extern int index_name_is_other(const struct index_state *, const char *, int); extern int ie_match_stat(const struct index_state *, struct cache_entry *, struct stat *, unsigned int); extern int ie_modified(const struct index_state *, struct cache_entry *, struct stat *, unsigned int); +#define PATHSPEC_ONESTAR 1 /* the pathspec pattern sastisfies GFNM_ONESTAR */ + struct pathspec { const char **raw; /* get_pathspec() result, not freed by free_pathspec() */ int nr; @@ -483,6 +485,7 @@ struct pathspec { const char *match; int len; int nowildcard_len; + int flags; } *items; }; diff --git a/dir.c b/dir.c index f81e1d29175..9afd388604c 100644 --- a/dir.c +++ b/dir.c @@ -46,6 +46,13 @@ inline int git_fnmatch(const char *pattern, const char *string, pattern += prefix; string += prefix; } + if (flags & GFNM_ONESTAR) { + int pattern_len = strlen(++pattern); + int string_len = strlen(string); + return string_len < pattern_len || + strcmp(pattern, + string + string_len - pattern_len); + } return fnmatch(pattern, string, fnm_flags); } @@ -246,7 +253,9 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix, } if (item->nowildcard_len < item->len && - !git_fnmatch(match, name, 0, item->nowildcard_len - prefix)) + !git_fnmatch(match, name, + item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0, + item->nowildcard_len - prefix)) return MATCHED_FNMATCH; return 0; @@ -1446,8 +1455,13 @@ int init_pathspec(struct pathspec *pathspec, const char **paths) item->match = path; item->len = strlen(path); item->nowildcard_len = simple_length(path); - if (item->nowildcard_len < item->len) + item->flags = 0; + if (item->nowildcard_len < item->len) { pathspec->has_wildcard = 1; + if (path[item->nowildcard_len] == '*' && + no_wildcard(path + item->nowildcard_len + 1)) + item->flags |= PATHSPEC_ONESTAR; + } } qsort(pathspec->items, pathspec->nr, diff --git a/dir.h b/dir.h index 0e8ae84628c..ab5af42b2ee 100644 --- a/dir.h +++ b/dir.h @@ -143,6 +143,7 @@ extern int fnmatch_icase(const char *pattern, const char *string, int flags); * The prefix part of pattern must not contains wildcards. */ #define GFNM_PATHNAME 1 /* similar to FNM_PATHNAME */ +#define GFNM_ONESTAR 2 /* there is only _one_ wildcard, a star */ extern int git_fnmatch(const char *pattern, const char *string, int flags, int prefix); diff --git a/tree-walk.c b/tree-walk.c index 2fcf3c02da2..585899ea24c 100644 --- a/tree-walk.c +++ b/tree-walk.c @@ -628,7 +628,8 @@ enum interesting tree_entry_interesting(const struct name_entry *entry, if (item->nowildcard_len < item->len) { if (!git_fnmatch(match + baselen, entry->path, - 0, item->nowildcard_len - baselen)) + item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0, + item->nowildcard_len - baselen)) return entry_interesting; /* @@ -654,7 +655,8 @@ match_wildcards: strbuf_add(base, entry->path, pathlen); if (!git_fnmatch(match, base->buf + base_offset, - 0, item->nowildcard_len)) { + item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0, + item->nowildcard_len)) { strbuf_setlen(base, base_offset + baselen); return entry_interesting; } From c904cd89e4328c0b5041b6ee0bb2df32459a33a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= <pclouds@gmail.com> Date: Sat, 24 Nov 2012 11:33:51 +0700 Subject: [PATCH 4/4] tree_entry_interesting: do basedir compare on wildcard patterns when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we treat "*.c" and "path/to/*.c" the same way. Which means we check all possible paths in repo against "path/to/*.c". One could see that "path/elsewhere/foo.c" obviously cannot match "path/to/*.c" and we only need to check all paths _inside_ "path/to/" against that pattern. This patch checks the leading fixed part of a pathspec against base directory and exit early if possible. We could even optimize further in "path/to/something*.c" case (i.e. check the fixed part against name_entry as well) but that's more complicated and probably does not gain us much. -O2 build on linux-2.6, without and with this patch respectively: $ time git rev-list --quiet HEAD -- 'drivers/*.c' real 1m9.484s user 1m9.128s sys 0m0.181s $ time ~/w/git/git rev-list --quiet HEAD -- 'drivers/*.c' real 0m15.710s user 0m15.564s sys 0m0.107s Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- tree-walk.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tree-walk.c b/tree-walk.c index 585899ea24c..6e30ef9d048 100644 --- a/tree-walk.c +++ b/tree-walk.c @@ -572,6 +572,54 @@ static int match_dir_prefix(const char *base, return 0; } +/* + * Perform matching on the leading non-wildcard part of + * pathspec. item->nowildcard_len must be greater than zero. Return + * non-zero if base is matched. + */ +static int match_wildcard_base(const struct pathspec_item *item, + const char *base, int baselen, + int *matched) +{ + const char *match = item->match; + /* the wildcard part is not considered in this function */ + int matchlen = item->nowildcard_len; + + if (baselen) { + int dirlen; + /* + * Return early if base is longer than the + * non-wildcard part but it does not match. + */ + if (baselen >= matchlen) { + *matched = matchlen; + return !strncmp(base, match, matchlen); + } + + dirlen = matchlen; + while (dirlen && match[dirlen - 1] != '/') + dirlen--; + + /* + * Return early if base is shorter than the + * non-wildcard part but it does not match. Note that + * base ends with '/' so we are sure it really matches + * directory + */ + if (strncmp(base, match, baselen)) + return 0; + *matched = baselen; + } else + *matched = 0; + /* + * we could have checked entry against the non-wildcard part + * that is not in base and does similar never_interesting + * optimization as in match_entry. For now just be happy with + * base comparison. + */ + return entry_interesting; +} + /* * Is a tree entry interesting given the pathspec we have? * @@ -602,7 +650,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry, const struct pathspec_item *item = ps->items+i; const char *match = item->match; const char *base_str = base->buf + base_offset; - int matchlen = item->len; + int matchlen = item->len, matched = 0; if (baselen >= matchlen) { /* If it doesn't match, move along... */ @@ -647,9 +695,24 @@ match_wildcards: if (item->nowildcard_len == item->len) continue; + if (item->nowildcard_len && + !match_wildcard_base(item, base_str, baselen, &matched)) + return entry_not_interesting; + /* * Concatenate base and entry->path into one and do * fnmatch() on it. + * + * While we could avoid concatenation in certain cases + * [1], which saves a memcpy and potentially a + * realloc, it turns out not worth it. Measurement on + * linux-2.6 does not show any clear improvements, + * partly because of the nowildcard_len optimization + * in git_fnmatch(). Avoid micro-optimizations here. + * + * [1] if match_wildcard_base() says the base + * directory is already matched, we only need to match + * the rest, which is shorter so _in theory_ faster. */ strbuf_add(base, entry->path, pathlen);