From 8b43fb18f808524297a050e33f33db8569bc1116 Mon Sep 17 00:00:00 2001 From: Jeff King <peff@peff.net> Date: Fri, 20 Mar 2015 14:43:02 -0400 Subject: [PATCH 1/5] t5312: test object deletion code paths in a corrupted repository When we are doing a destructive operation like "git prune", we want to be extra careful that the set of reachable tips we compute is valid. If there is any corruption or oddity, we are better off aborting the operation and letting the user figure things out rather than plowing ahead and possibly deleting some data that cannot be recovered. The tests here include: 1. Pruning objects mentioned only be refs with invalid names. This used to abort prior to d0f810f (refs.c: allow listing and deleting badly named refs, 2014-09-03), but since then we silently ignore the tip. Likewise, we test repacking that can drop objects (either "-ad", which drops anything unreachable, or "-Ad --unpack-unreachable=<time>", which tries to optimize out a loose object write that would be directly pruned). 2. Pruning objects when some refs point to missing objects. We don't know whether any dangling objects would have been reachable from the missing objects. We are better to keep them around, as they are better than nothing for helping the user recover history. 3. Packed refs that point to missing objects can sometimes be dropped. By itself, this is more of an annoyance (you do not have the object anyway; even if you can recover it from elsewhere, all you are losing is a placeholder for your state at the time of corruption). But coupled with (2), if we drop the ref and then go on to prune, we may lose unrecoverable objects. Note that we use test_might_fail for some of the operations. In some cases, it would be appropriate to abort the operation, and in others, it might be acceptable to continue but taking the information into account. The tests don't care either way, and check only for data loss. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- t/t5312-prune-corruption.sh | 114 ++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100755 t/t5312-prune-corruption.sh diff --git a/t/t5312-prune-corruption.sh b/t/t5312-prune-corruption.sh new file mode 100755 index 00000000000..496a9f5617e --- /dev/null +++ b/t/t5312-prune-corruption.sh @@ -0,0 +1,114 @@ +#!/bin/sh + +test_description=' +Test pruning of repositories with minor corruptions. The goal +here is that we should always be erring on the side of safety. So +if we see, for example, a ref with a bogus name, it is OK either to +bail out or to proceed using it as a reachable tip, but it is _not_ +OK to proceed as if it did not exist. Otherwise we might silently +delete objects that cannot be recovered. +' +. ./test-lib.sh + +test_expect_success 'disable reflogs' ' + git config core.logallrefupdates false && + rm -rf .git/logs +' + +test_expect_success 'create history reachable only from a bogus-named ref' ' + test_tick && git commit --allow-empty -m master && + base=$(git rev-parse HEAD) && + test_tick && git commit --allow-empty -m bogus && + bogus=$(git rev-parse HEAD) && + git cat-file commit $bogus >saved && + echo $bogus >.git/refs/heads/bogus..name && + git reset --hard HEAD^ +' + +test_expect_failure 'pruning does not drop bogus object' ' + test_when_finished "git hash-object -w -t commit saved" && + test_might_fail git prune --expire=now && + verbose git cat-file -e $bogus +' + +test_expect_success 'put bogus object into pack' ' + git tag reachable $bogus && + git repack -ad && + git tag -d reachable && + verbose git cat-file -e $bogus +' + +test_expect_failure 'destructive repack keeps packed object' ' + test_might_fail git repack -Ad --unpack-unreachable=now && + verbose git cat-file -e $bogus && + test_might_fail git repack -ad && + verbose git cat-file -e $bogus +' + +# subsequent tests will have different corruptions +test_expect_success 'clean up bogus ref' ' + rm .git/refs/heads/bogus..name +' + +# We create two new objects here, "one" and "two". Our +# master branch points to "two", which is deleted, +# corrupting the repository. But we'd like to make sure +# that the otherwise unreachable "one" is not pruned +# (since it is the user's best bet for recovering +# from the corruption). +# +# Note that we also point HEAD somewhere besides "two", +# as we want to make sure we test the case where we +# pick up the reference to "two" by iterating the refs, +# not by resolving HEAD. +test_expect_success 'create history with missing tip commit' ' + test_tick && git commit --allow-empty -m one && + recoverable=$(git rev-parse HEAD) && + git cat-file commit $recoverable >saved && + test_tick && git commit --allow-empty -m two && + missing=$(git rev-parse HEAD) && + git checkout --detach $base && + rm .git/objects/$(echo $missing | sed "s,..,&/,") && + test_must_fail git cat-file -e $missing +' + +test_expect_failure 'pruning with a corrupted tip does not drop history' ' + test_when_finished "git hash-object -w -t commit saved" && + test_might_fail git prune --expire=now && + verbose git cat-file -e $recoverable +' + +test_expect_success 'pack-refs does not silently delete broken loose ref' ' + git pack-refs --all --prune && + echo $missing >expect && + git rev-parse refs/heads/master >actual && + test_cmp expect actual +' + +# we do not want to count on running pack-refs to +# actually pack it, as it is perfectly reasonable to +# skip processing a broken ref +test_expect_success 'create packed-refs file with broken ref' ' + rm -f .git/refs/heads/master && + cat >.git/packed-refs <<-EOF && + $missing refs/heads/master + $recoverable refs/heads/other + EOF + echo $missing >expect && + git rev-parse refs/heads/master >actual && + test_cmp expect actual +' + +test_expect_success 'pack-refs does not silently delete broken packed ref' ' + git pack-refs --all --prune && + git rev-parse refs/heads/master >actual && + test_cmp expect actual +' + +test_expect_failure 'pack-refs does not drop broken refs during deletion' ' + git update-ref -d refs/heads/other && + git rev-parse refs/heads/master >actual && + test_cmp expect actual +' + +test_done From 49672f26d9a3826a6a74c6ff4d2409b7b0c74495 Mon Sep 17 00:00:00 2001 From: Jeff King <peff@peff.net> Date: Fri, 20 Mar 2015 14:43:06 -0400 Subject: [PATCH 2/5] refs: introduce a "ref paranoia" flag Most operations that iterate over refs are happy to ignore broken cruft. However, some operations should be performed with knowledge of these broken refs, because it is better for the operation to choke on a missing object than it is to silently pretend that the ref did not exist (e.g., if we are computing the set of reachable tips in order to prune objects). These processes could just call for_each_rawref, except that ref iteration is often hidden behind other interfaces. For instance, for a destructive "repack -ad", we would have to inform "pack-objects" that we are destructive, and then it would in turn have to tell the revision code that our "--all" should include broken refs. It's much simpler to just set a global for "dangerous" operations that includes broken refs in all iterations. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- Documentation/git.txt | 11 +++++++++++ cache.h | 8 ++++++++ environment.c | 1 + refs.c | 5 +++++ 4 files changed, 25 insertions(+) diff --git a/Documentation/git.txt b/Documentation/git.txt index af306209e93..8da85a604ea 100644 --- a/Documentation/git.txt +++ b/Documentation/git.txt @@ -1026,6 +1026,17 @@ GIT_ICASE_PATHSPECS:: variable when it is invoked as the top level command by the end user, to be recorded in the body of the reflog. +`GIT_REF_PARANOIA`:: + If set to `1`, include broken or badly named refs when iterating + over lists of refs. In a normal, non-corrupted repository, this + does nothing. However, enabling it may help git to detect and + abort some operations in the presence of broken refs. Git sets + this variable automatically when performing destructive + operations like linkgit:git-prune[1]. You should not need to set + it yourself unless you want to be paranoid about making sure + an operation has touched every ref (e.g., because you are + cloning a repository to make a backup). + Discussion[[Discussion]] ------------------------ diff --git a/cache.h b/cache.h index 4d02efc9054..23806394eb5 100644 --- a/cache.h +++ b/cache.h @@ -613,6 +613,14 @@ extern int precomposed_unicode; extern int protect_hfs; extern int protect_ntfs; +/* + * Include broken refs in all ref iterations, which will + * generally choke dangerous operations rather than letting + * them silently proceed without taking the broken ref into + * account. + */ +extern int ref_paranoia; + /* * The character that begins a commented line in user-editable file * that is subject to stripspace. diff --git a/environment.c b/environment.c index 1ade5c9684a..a40044c3bf8 100644 --- a/environment.c +++ b/environment.c @@ -24,6 +24,7 @@ int is_bare_repository_cfg = -1; /* unspecified */ int log_all_ref_updates = -1; /* unspecified */ int warn_ambiguous_refs = 1; int warn_on_object_refname_ambiguity = 1; +int ref_paranoia = -1; int repository_format_version; const char *git_commit_encoding; const char *git_log_output_encoding; diff --git a/refs.c b/refs.c index 9edf18b04e7..6c788814200 100644 --- a/refs.c +++ b/refs.c @@ -1907,6 +1907,11 @@ static int do_for_each_ref(struct ref_cache *refs, const char *base, data.fn = fn; data.cb_data = cb_data; + if (ref_paranoia < 0) + ref_paranoia = git_env_bool("GIT_REF_PARANOIA", 0); + if (ref_paranoia) + data.flags |= DO_FOR_EACH_INCLUDE_BROKEN; + return do_for_each_entry(refs, base, do_one_ref, &data); } From ff4056bbc35aa499f449cbab46702a76cea0e532 Mon Sep 17 00:00:00 2001 From: Jeff King <peff@peff.net> Date: Fri, 20 Mar 2015 14:43:09 -0400 Subject: [PATCH 3/5] prune: turn on ref_paranoia flag Prune should know about broken objects at the tips of refs, so that we can feed them to our traversal rather than ignoring them. It's better for us to abort the operation on the broken object than it is to start deleting objects with an incomplete view of the reachability namespace. Note that for missing objects, aborting is the best we can do. For a badly-named ref, we technically could use its sha1 as a reachability tip. However, the iteration code just feeds us a null sha1, so there would be a reasonable amount of code involved to pass down our wishes. It's not really worth trying to do better, because this is a case that should happen extremely rarely, and the message we provide: fatal: unable to parse object: refs/heads/bogus:name is probably enough to point the user in the right direction. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/prune.c | 1 + t/t5312-prune-corruption.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/builtin/prune.c b/builtin/prune.c index 04d3b12ae4e..17094ad954c 100644 --- a/builtin/prune.c +++ b/builtin/prune.c @@ -115,6 +115,7 @@ int cmd_prune(int argc, const char **argv, const char *prefix) expire = ULONG_MAX; save_commit_buffer = 0; check_replace_refs = 0; + ref_paranoia = 1; init_revisions(&revs, prefix); argc = parse_options(argc, argv, prefix, options, prune_usage, 0); diff --git a/t/t5312-prune-corruption.sh b/t/t5312-prune-corruption.sh index 496a9f5617e..5ffb81715e0 100755 --- a/t/t5312-prune-corruption.sh +++ b/t/t5312-prune-corruption.sh @@ -25,7 +25,7 @@ test_expect_success 'create history reachable only from a bogus-named ref' ' git reset --hard HEAD^ ' -test_expect_failure 'pruning does not drop bogus object' ' +test_expect_success 'pruning does not drop bogus object' ' test_when_finished "git hash-object -w -t commit saved" && test_might_fail git prune --expire=now && verbose git cat-file -e $bogus @@ -72,7 +72,7 @@ test_expect_success 'create history with missing tip commit' ' test_must_fail git cat-file -e $missing ' -test_expect_failure 'pruning with a corrupted tip does not drop history' ' +test_expect_success 'pruning with a corrupted tip does not drop history' ' test_when_finished "git hash-object -w -t commit saved" && test_might_fail git prune --expire=now && verbose git cat-file -e $recoverable From 8d422993617be42a48a54fd7325d5ba5350c1082 Mon Sep 17 00:00:00 2001 From: Jeff King <peff@peff.net> Date: Fri, 20 Mar 2015 14:43:13 -0400 Subject: [PATCH 4/5] repack: turn on "ref paranoia" when doing a destructive repack If we are repacking with "-ad", we will drop any unreachable objects. Likewise, using "-Ad --unpack-unreachable=<time>" will drop any old, unreachable objects. In these cases, we want to make sure the reachability we compute with "--all" is complete. We can do this by passing GIT_REF_PARANOIA=1 in the environment to pack-objects. Note that "-Ad" is safe already, because it only loosens unreachable objects. It is up to "git prune" to avoid deleting them. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/repack.c | 8 ++++++-- t/t5312-prune-corruption.sh | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index 3f852f35d1e..2fe1b30d716 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -228,13 +228,17 @@ int cmd_repack(int argc, const char **argv, const char *prefix) get_non_kept_pack_filenames(&existing_packs); if (existing_packs.nr && delete_redundant) { - if (unpack_unreachable) + if (unpack_unreachable) { argv_array_pushf(&cmd.args, "--unpack-unreachable=%s", unpack_unreachable); - else if (pack_everything & LOOSEN_UNREACHABLE) + argv_array_push(&cmd.env_array, "GIT_REF_PARANOIA=1"); + } else if (pack_everything & LOOSEN_UNREACHABLE) { argv_array_push(&cmd.args, "--unpack-unreachable"); + } else { + argv_array_push(&cmd.env_array, "GIT_REF_PARANOIA=1"); + } } } else { argv_array_push(&cmd.args, "--unpacked"); diff --git a/t/t5312-prune-corruption.sh b/t/t5312-prune-corruption.sh index 5ffb81715e0..e8d04ef1bfb 100755 --- a/t/t5312-prune-corruption.sh +++ b/t/t5312-prune-corruption.sh @@ -38,7 +38,7 @@ test_expect_success 'put bogus object into pack' ' verbose git cat-file -e $bogus ' -test_expect_failure 'destructive repack keeps packed object' ' +test_expect_success 'destructive repack keeps packed object' ' test_might_fail git repack -Ad --unpack-unreachable=now && verbose git cat-file -e $bogus && test_might_fail git repack -ad && From ea56c4e02fc1d5ce6a6b5083c284e32ffc6367e6 Mon Sep 17 00:00:00 2001 From: Jeff King <peff@peff.net> Date: Fri, 20 Mar 2015 14:43:17 -0400 Subject: [PATCH 5/5] refs.c: drop curate_packed_refs When we delete a ref, we have to rewrite the entire packed-refs file. We take this opportunity to "curate" the packed-refs file and drop any entries that are crufty or broken. Dropping broken entries (e.g., with bogus names, or ones that point to missing objects) is actively a bad idea, as it means that we lose any notion that the data was there in the first place. Aside from the general hackiness that we might lose any information about ref "foo" while deleting an unrelated ref "bar", this may seriously hamper any attempts by the user at recovering from the corruption in "foo". They will lose the sha1 and name of "foo"; the exact pointer may still be useful even if they recover missing objects from a different copy of the repository. But worse, once the ref is gone, there is no trace of the corruption. A follow-up "git prune" may delete objects, even though it would otherwise bail when seeing corruption. We could just drop the "broken" bits from curate_packed_refs, and continue to drop the "crufty" bits: refs whose loose counterpart exists in the filesystem. This is not wrong to do, and it does have the advantage that we may write out a slightly smaller packed-refs file. But it has two disadvantages: 1. It is a potential source of races or mistakes with respect to these refs that are otherwise unrelated to the operation. To my knowledge, there aren't any active problems in this area, but it seems like an unnecessary risk. 2. We have to spend time looking up the matching loose refs for every item in the packed-refs file. If you have a large number of packed refs that do not change, that outweighs the benefit from writing out a smaller packed-refs file (it doesn't get smaller, and you do a bunch of directory traversal to find that out). Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- refs.c | 67 +------------------------------------ t/t5312-prune-corruption.sh | 2 +- 2 files changed, 2 insertions(+), 67 deletions(-) diff --git a/refs.c b/refs.c index 6c788814200..3a26ad4e65b 100644 --- a/refs.c +++ b/refs.c @@ -2596,68 +2596,10 @@ int pack_refs(unsigned int flags) return 0; } -/* - * If entry is no longer needed in packed-refs, add it to the string - * list pointed to by cb_data. Reasons for deleting entries: - * - * - Entry is broken. - * - Entry is overridden by a loose ref. - * - Entry does not point at a valid object. - * - * In the first and third cases, also emit an error message because these - * are indications of repository corruption. - */ -static int curate_packed_ref_fn(struct ref_entry *entry, void *cb_data) -{ - struct string_list *refs_to_delete = cb_data; - - if (entry->flag & REF_ISBROKEN) { - /* This shouldn't happen to packed refs. */ - error("%s is broken!", entry->name); - string_list_append(refs_to_delete, entry->name); - return 0; - } - if (!has_sha1_file(entry->u.value.sha1)) { - unsigned char sha1[20]; - int flags; - - if (read_ref_full(entry->name, 0, sha1, &flags)) - /* We should at least have found the packed ref. */ - die("Internal error"); - if ((flags & REF_ISSYMREF) || !(flags & REF_ISPACKED)) { - /* - * This packed reference is overridden by a - * loose reference, so it is OK that its value - * is no longer valid; for example, it might - * refer to an object that has been garbage - * collected. For this purpose we don't even - * care whether the loose reference itself is - * invalid, broken, symbolic, etc. Silently - * remove the packed reference. - */ - string_list_append(refs_to_delete, entry->name); - return 0; - } - /* - * There is no overriding loose reference, so the fact - * that this reference doesn't refer to a valid object - * indicates some kind of repository corruption. - * Report the problem, then omit the reference from - * the output. - */ - error("%s does not point to a valid object!", entry->name); - string_list_append(refs_to_delete, entry->name); - return 0; - } - - return 0; -} - int repack_without_refs(struct string_list *refnames, struct strbuf *err) { struct ref_dir *packed; - struct string_list refs_to_delete = STRING_LIST_INIT_DUP; - struct string_list_item *refname, *ref_to_delete; + struct string_list_item *refname; int ret, needs_repacking = 0, removed = 0; assert(err); @@ -2693,13 +2635,6 @@ int repack_without_refs(struct string_list *refnames, struct strbuf *err) return 0; } - /* Remove any other accumulated cruft */ - do_for_each_entry_in_dir(packed, 0, curate_packed_ref_fn, &refs_to_delete); - for_each_string_list_item(ref_to_delete, &refs_to_delete) { - if (remove_entry(packed, ref_to_delete->string) == -1) - die("internal error"); - } - /* Write what remains */ ret = commit_packed_refs(); if (ret) diff --git a/t/t5312-prune-corruption.sh b/t/t5312-prune-corruption.sh index e8d04ef1bfb..8e98b44083f 100755 --- a/t/t5312-prune-corruption.sh +++ b/t/t5312-prune-corruption.sh @@ -105,7 +105,7 @@ test_expect_success 'pack-refs does not silently delete broken packed ref' ' test_cmp expect actual ' -test_expect_failure 'pack-refs does not drop broken refs during deletion' ' +test_expect_success 'pack-refs does not drop broken refs during deletion' ' git update-ref -d refs/heads/other && git rev-parse refs/heads/master >actual && test_cmp expect actual