2018-04-02 22:34:19 +02:00
|
|
|
#include "git-compat-util.h"
|
2020-06-05 15:00:28 +02:00
|
|
|
#include "config.h"
|
2018-04-02 22:34:19 +02:00
|
|
|
#include "lockfile.h"
|
|
|
|
#include "pack.h"
|
|
|
|
#include "packfile.h"
|
|
|
|
#include "commit.h"
|
|
|
|
#include "object.h"
|
2018-06-27 15:24:45 +02:00
|
|
|
#include "refs.h"
|
2018-04-02 22:34:19 +02:00
|
|
|
#include "revision.h"
|
2020-12-31 12:56:23 +01:00
|
|
|
#include "hash-lookup.h"
|
2018-04-02 22:34:19 +02:00
|
|
|
#include "commit-graph.h"
|
2018-05-08 08:59:20 +02:00
|
|
|
#include "object-store.h"
|
2018-06-27 15:24:36 +02:00
|
|
|
#include "alloc.h"
|
2018-08-20 20:24:27 +02:00
|
|
|
#include "hashmap.h"
|
|
|
|
#include "replace-object.h"
|
commit-graph write: add progress output
Before this change the "commit-graph write" command didn't report any
progress. On my machine this command takes more than 10 seconds to
write the graph for linux.git, and around 1m30s on the
2015-04-03-1M-git.git[1] test repository (a test case for a large
monorepository).
Furthermore, since the gc.writeCommitGraph setting was added in
d5d5d7b641 ("gc: automatically write commit-graph files", 2018-06-27),
there was no indication at all from a "git gc" run that anything was
different. This is why one of the progress bars being added here uses
start_progress() instead of start_delayed_progress(), so that it's
guaranteed to be seen. E.g. on my tiny 867 commit dotfiles.git
repository:
$ git -c gc.writeCommitGraph=true gc
Enumerating objects: 2821, done.
[...]
Computing commit graph generation numbers: 100% (867/867), done.
On larger repositories, such as linux.git the delayed progress bar(s)
will kick in, and we'll show what's going on instead of, as was
previously happening, printing nothing while we write the graph:
$ git -c gc.writeCommitGraph=true gc
[...]
Annotating commits in commit graph: 1565573, done.
Computing commit graph generation numbers: 100% (782484/782484), done.
Note that here we don't show "Finding commits for commit graph", this
is because under "git gc" we seed the search with the commit
references in the repository, and that set is too small to show any
progress, but would e.g. on a smaller repo such as git.git with
--stdin-commits:
$ git rev-list --all | git -c gc.writeCommitGraph=true write --stdin-commits
Finding commits for commit graph: 100% (162576/162576), done.
Computing commit graph generation numbers: 100% (162576/162576), done.
With --stdin-packs we don't show any estimation of how much is left to
do. This is because we might be processing more than one pack. We
could be less lazy here and show progress, either by detecting that
we're only processing one pack, or by first looping over the packs to
discover how many commits they have. I don't see the point in doing
that work. So instead we get (on 2015-04-03-1M-git.git):
$ echo pack-<HASH>.idx | git -c gc.writeCommitGraph=true --exec-path=$PWD commit-graph write --stdin-packs
Finding commits for commit graph: 13064614, done.
Annotating commits in commit graph: 3001341, done.
Computing commit graph generation numbers: 100% (1000447/1000447), done.
No GC mode uses --stdin-packs. It's what they use at Microsoft to
manually compute the generation numbers for their collection of large
packs which are never coalesced.
The reason we need a "report_progress" variable passed down from "git
gc" is so that we don't report this output when we're running in the
process "git gc --auto" detaches from the terminal.
Since we write the commit graph from the "git gc" process itself (as
opposed to what we do with say the "git repack" phase), we'd end up
writing the output to .git/gc.log and reporting it to the user next
time as part of the "The last gc run reported the following[...]"
error, see 329e6e8794 ("gc: save log from daemonized gc --auto and
print it next time", 2015-09-19).
So we must keep track of whether or not we're running in that
daemonized mode, and if so print no progress.
See [2] and subsequent replies for a discussion of an approach not
taken in compute_generation_numbers(). I.e. we're saying "Computing
commit graph generation numbers", even though on an established
history we're mostly skipping over all the work we did in the
past. This is similar to the white lie we tell in the "Writing
objects" phase (not all objects are being written).
Always showing progress is considered more important than
accuracy. I.e. on a repository like 2015-04-03-1M-git.git we'd hang
for 6 seconds with no output on the second "git gc" if no changes were
made to any objects in the interim if we'd take the approach in [2].
1. https://github.com/avar/2015-04-03-1M-git
2. <c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com>
(https://public-inbox.org/git/c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com/)
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-09-17 17:33:35 +02:00
|
|
|
#include "progress.h"
|
2020-03-30 02:31:28 +02:00
|
|
|
#include "bloom.h"
|
2020-03-30 02:31:29 +02:00
|
|
|
#include "commit-slab.h"
|
2020-04-30 21:48:50 +02:00
|
|
|
#include "shallow.h"
|
2020-07-01 15:27:24 +02:00
|
|
|
#include "json-writer.h"
|
|
|
|
#include "trace2.h"
|
2018-04-02 22:34:19 +02:00
|
|
|
|
2020-04-16 22:14:03 +02:00
|
|
|
void git_test_write_commit_graph_or_die(void)
|
|
|
|
{
|
|
|
|
int flags = 0;
|
|
|
|
if (!git_env_bool(GIT_TEST_COMMIT_GRAPH, 0))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
|
|
|
|
flags = COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
|
|
|
|
|
|
|
|
if (write_commit_graph_reachable(the_repository->objects->odb,
|
|
|
|
flags, NULL))
|
|
|
|
die("failed to write commit-graph under GIT_TEST_COMMIT_GRAPH");
|
|
|
|
}
|
|
|
|
|
2018-04-02 22:34:19 +02:00
|
|
|
#define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
|
|
|
|
#define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
|
|
|
|
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
|
|
|
|
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 21:21:13 +01:00
|
|
|
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
|
2020-04-06 18:59:49 +02:00
|
|
|
#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
|
|
|
|
#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
|
2019-06-18 20:14:26 +02:00
|
|
|
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
|
2020-04-06 18:59:49 +02:00
|
|
|
#define MAX_NUM_CHUNKS 7
|
2018-04-02 22:34:19 +02:00
|
|
|
|
2018-11-14 05:09:35 +01:00
|
|
|
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
|
2018-04-02 22:34:19 +02:00
|
|
|
|
|
|
|
#define GRAPH_VERSION_1 0x1
|
|
|
|
#define GRAPH_VERSION GRAPH_VERSION_1
|
|
|
|
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 21:21:13 +01:00
|
|
|
#define GRAPH_EXTRA_EDGES_NEEDED 0x80000000
|
2018-04-02 22:34:19 +02:00
|
|
|
#define GRAPH_EDGE_LAST_MASK 0x7fffffff
|
|
|
|
#define GRAPH_PARENT_NONE 0x70000000
|
|
|
|
|
|
|
|
#define GRAPH_LAST_EDGE 0x80000000
|
|
|
|
|
2018-06-27 15:24:28 +02:00
|
|
|
#define GRAPH_HEADER_SIZE 8
|
2018-04-02 22:34:19 +02:00
|
|
|
#define GRAPH_FANOUT_SIZE (4 * 256)
|
|
|
|
#define GRAPH_CHUNKLOOKUP_WIDTH 12
|
2018-06-27 15:24:28 +02:00
|
|
|
#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \
|
2018-11-14 05:09:35 +01:00
|
|
|
+ GRAPH_FANOUT_SIZE + the_hash_algo->rawsz)
|
2018-04-02 22:34:19 +02:00
|
|
|
|
commit-graph: fix writing first commit-graph during fetch
The previous commit includes a failing test for an issue around
fetch.writeCommitGraph and fetching in a repo with a submodule. Here, we
fix that bug and set the test to "test_expect_success".
The problem arises with this set of commands when the remote repo at
<url> has a submodule. Note that --recurse-submodules is not needed to
demonstrate the bug.
$ git clone <url> test
$ cd test
$ git -c fetch.writeCommitGraph=true fetch origin
Computing commit graph generation numbers: 100% (12/12), done.
BUG: commit-graph.c:886: missing parent <hash1> for commit <hash2>
Aborted (core dumped)
As an initial fix, I converted the code in builtin/fetch.c that calls
write_commit_graph_reachable() to instead launch a "git commit-graph
write --reachable --split" process. That code worked, but is not how we
want the feature to work long-term.
That test did demonstrate that the issue must be something to do with
internal state of the 'git fetch' process.
The write_commit_graph() method in commit-graph.c ensures the commits we
plan to write are "closed under reachability" using close_reachable().
This method walks from the input commits, and uses the UNINTERESTING
flag to mark which commits have already been visited. This allows the
walk to take O(N) time, where N is the number of commits, instead of
O(P) time, where P is the number of paths. (The number of paths can be
exponential in the number of commits.)
However, the UNINTERESTING flag is used in lots of places in the
codebase. This flag usually means some barrier to stop a commit walk,
such as in revision-walking to compare histories. It is not often
cleared after the walk completes because the starting points of those
walks do not have the UNINTERESTING flag, and clear_commit_marks() would
stop immediately.
This is happening during a 'git fetch' call with a remote. The fetch
negotiation is comparing the remote refs with the local refs and marking
some commits as UNINTERESTING.
I tested running clear_commit_marks_many() to clear the UNINTERESTING
flag inside close_reachable(), but the tips did not have the flag, so
that did nothing.
It turns out that the calculate_changed_submodule_paths() method is at
fault. Thanks, Peff, for pointing out this detail! More specifically,
for each submodule, the collect_changed_submodules() runs a revision
walk to essentially do file-history on the list of submodules. That
revision walk marks commits UNINTERESTING if they are simplified away
by not changing the submodule.
Instead, I finally arrived on the conclusion that I should use a flag
that is not used in any other part of the code. In commit-reach.c, a
number of flags were defined for commit walk algorithms. The REACHABLE
flag seemed like it made the most sense, and it seems it was not
actually used in the file. The REACHABLE flag was used in early versions
of commit-reach.c, but was removed by 4fbcca4 (commit-reach: make
can_all_from_reach... linear, 2018-07-20).
Add the REACHABLE flag to commit-graph.c and use it instead of
UNINTERESTING in close_reachable(). This fixes the bug in manual
testing.
Reported-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Helped-by: Jeff King <peff@peff.net>
Helped-by: Szeder Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-10-24 15:40:42 +02:00
|
|
|
/* Remember to update object flag allocation in object.h */
|
|
|
|
#define REACHABLE (1u<<15)
|
|
|
|
|
2020-03-30 02:31:29 +02:00
|
|
|
/* Keep track of the order in which commits are added to our list. */
|
|
|
|
define_commit_slab(commit_pos, int);
|
|
|
|
static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
|
|
|
|
|
|
|
|
static void set_commit_pos(struct repository *r, const struct object_id *oid)
|
|
|
|
{
|
|
|
|
static int32_t max_pos;
|
|
|
|
struct commit *commit = lookup_commit(r, oid);
|
|
|
|
|
|
|
|
if (!commit)
|
|
|
|
return; /* should never happen, but be lenient */
|
|
|
|
|
|
|
|
*commit_pos_at(&commit_pos, commit) = max_pos++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int commit_pos_cmp(const void *va, const void *vb)
|
2018-04-02 22:34:19 +02:00
|
|
|
{
|
2020-03-30 02:31:29 +02:00
|
|
|
const struct commit *a = *(const struct commit **)va;
|
|
|
|
const struct commit *b = *(const struct commit **)vb;
|
|
|
|
return commit_pos_at(&commit_pos, a) -
|
|
|
|
commit_pos_at(&commit_pos, b);
|
|
|
|
}
|
|
|
|
|
2020-06-17 11:14:09 +02:00
|
|
|
define_commit_slab(commit_graph_data_slab, struct commit_graph_data);
|
|
|
|
static struct commit_graph_data_slab commit_graph_data_slab =
|
|
|
|
COMMIT_SLAB_INIT(1, commit_graph_data_slab);
|
|
|
|
|
|
|
|
uint32_t commit_graph_position(const struct commit *c)
|
|
|
|
{
|
|
|
|
struct commit_graph_data *data =
|
|
|
|
commit_graph_data_slab_peek(&commit_graph_data_slab, c);
|
|
|
|
|
|
|
|
return data ? data->graph_pos : COMMIT_NOT_FROM_GRAPH;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t commit_graph_generation(const struct commit *c)
|
|
|
|
{
|
|
|
|
struct commit_graph_data *data =
|
|
|
|
commit_graph_data_slab_peek(&commit_graph_data_slab, c);
|
|
|
|
|
|
|
|
if (!data)
|
|
|
|
return GENERATION_NUMBER_INFINITY;
|
|
|
|
else if (data->graph_pos == COMMIT_NOT_FROM_GRAPH)
|
|
|
|
return GENERATION_NUMBER_INFINITY;
|
|
|
|
|
|
|
|
return data->generation;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct commit_graph_data *commit_graph_data_at(const struct commit *c)
|
|
|
|
{
|
|
|
|
unsigned int i, nth_slab;
|
|
|
|
struct commit_graph_data *data =
|
|
|
|
commit_graph_data_slab_peek(&commit_graph_data_slab, c);
|
|
|
|
|
|
|
|
if (data)
|
|
|
|
return data;
|
|
|
|
|
|
|
|
nth_slab = c->index / commit_graph_data_slab.slab_size;
|
|
|
|
data = commit_graph_data_slab_at(&commit_graph_data_slab, c);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* commit-slab initializes elements with zero, overwrite this with
|
|
|
|
* COMMIT_NOT_FROM_GRAPH for graph_pos.
|
|
|
|
*
|
|
|
|
* We avoid initializing generation with checking if graph position
|
|
|
|
* is not COMMIT_NOT_FROM_GRAPH.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < commit_graph_data_slab.slab_size; i++) {
|
|
|
|
commit_graph_data_slab.slab[nth_slab][i].graph_pos =
|
|
|
|
COMMIT_NOT_FROM_GRAPH;
|
|
|
|
}
|
|
|
|
|
|
|
|
return data;
|
|
|
|
}
|
|
|
|
|
2020-03-30 02:31:30 +02:00
|
|
|
static int commit_gen_cmp(const void *va, const void *vb)
|
|
|
|
{
|
|
|
|
const struct commit *a = *(const struct commit **)va;
|
|
|
|
const struct commit *b = *(const struct commit **)vb;
|
|
|
|
|
2020-06-17 11:14:11 +02:00
|
|
|
uint32_t generation_a = commit_graph_generation(a);
|
|
|
|
uint32_t generation_b = commit_graph_generation(b);
|
2020-03-30 02:31:30 +02:00
|
|
|
/* lower generation commits first */
|
2020-06-17 11:14:11 +02:00
|
|
|
if (generation_a < generation_b)
|
2020-03-30 02:31:30 +02:00
|
|
|
return -1;
|
2020-06-17 11:14:11 +02:00
|
|
|
else if (generation_a > generation_b)
|
2020-03-30 02:31:30 +02:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* use date as a heuristic when generations are equal */
|
|
|
|
if (a->date < b->date)
|
|
|
|
return -1;
|
|
|
|
else if (a->date > b->date)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-03-30 02:31:29 +02:00
|
|
|
char *get_commit_graph_filename(struct object_directory *obj_dir)
|
2018-04-02 22:34:19 +02:00
|
|
|
{
|
2020-03-30 02:31:29 +02:00
|
|
|
return xstrfmt("%s/info/commit-graph", obj_dir->path);
|
2018-04-02 22:34:19 +02:00
|
|
|
}
|
|
|
|
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb'. To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to, but above, the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-03 22:18:02 +01:00
|
|
|
static char *get_split_graph_filename(struct object_directory *odb,
|
2019-06-18 20:14:25 +02:00
|
|
|
const char *oid_hex)
|
|
|
|
{
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb' To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports that the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to but above of the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-03 22:18:02 +01:00
|
|
|
return xstrfmt("%s/info/commit-graphs/graph-%s.graph", odb->path,
|
|
|
|
oid_hex);
|
2019-06-18 20:14:25 +02:00
|
|
|
}
|
|
|
|
|
2020-09-17 20:11:46 +02:00
|
|
|
char *get_commit_graph_chain_filename(struct object_directory *odb)
|
2019-06-18 20:14:25 +02:00
|
|
|
{
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb' To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports that the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to but above of the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-03 22:18:02 +01:00
|
|
|
return xstrfmt("%s/info/commit-graphs/commit-graph-chain", odb->path);
|
2018-04-02 22:34:19 +02:00
|
|
|
}
|
|
|
|
|
2018-11-14 05:09:35 +01:00
|
|
|
static uint8_t oid_version(void)
|
|
|
|
{
|
2020-08-17 16:04:47 +02:00
|
|
|
switch (hash_algo_by_ptr(the_hash_algo)) {
|
|
|
|
case GIT_HASH_SHA1:
|
|
|
|
return 1;
|
|
|
|
case GIT_HASH_SHA256:
|
|
|
|
return 2;
|
|
|
|
default:
|
|
|
|
die(_("invalid hash version"));
|
|
|
|
}
|
2018-11-14 05:09:35 +01:00
|
|
|
}
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
static struct commit_graph *alloc_commit_graph(void)
|
|
|
|
{
|
|
|
|
struct commit_graph *g = xcalloc(1, sizeof(*g));
|
|
|
|
|
|
|
|
return g;
|
|
|
|
}
|
|
|
|
|
2018-08-20 20:24:27 +02:00
|
|
|
extern int read_replace_refs;
|
|
|
|
|
|
|
|
/*
 * Decide whether repository "r" passes the checks this file applies before
 * working with commit-graph data.  Returns 1 when every check below passes
 * and 0 as soon as one of them fails.
 */
static int commit_graph_compatible(struct repository *r)
|
|
|
|
{
|
2018-08-20 20:24:32 +02:00
|
|
|
/* no git directory recorded for this repository */
if (!r->gitdir)
|
|
|
|
return 0;
|
|
|
|
|
2018-08-20 20:24:27 +02:00
|
|
|
/* replace refs are honored and at least one replacement is registered */
if (read_replace_refs) {
|
|
|
|
prepare_replace_object(r);
|
|
|
|
if (hashmap_get_size(&r->objects->replace_map->map))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-08-20 20:24:30 +02:00
|
|
|
prepare_commit_graft(r);
|
commit.c: don't persist substituted parents when unshallowing
Since 37b9dcabfc (shallow.c: use '{commit,rollback}_shallow_file',
2020-04-22), Git knows how to reset stat-validity checks for the
$GIT_DIR/shallow file, allowing it to change between a shallow and
non-shallow state in the same process (e.g., in the case of 'git fetch
--unshallow').
However, when $GIT_DIR/shallow changes, Git does not alter or remove any
grafts (nor substituted parents) in memory.
This comes up in a "git fetch --unshallow" with fetch.writeCommitGraph
set to true. Ordinarily in a shallow repository (and before 37b9dcabfc,
even in this case), commit_graph_compatible() would return false,
indicating that the repository should not be used to write a
commit-graphs (since commit-graph files cannot represent a shallow
history). But since 37b9dcabfc, in an --unshallow operation that check
succeeds.
Thus even though the repository isn't shallow any longer (that is, we
have all of the objects), the in-core representation of those objects
still has munged parents at the shallow boundaries. When the
commit-graph write proceeds, we use the incorrect parentage, producing
wrong results.
There are two ways for a user to work around this: either (1) set
'fetch.writeCommitGraph' to 'false', or (2) drop the commit-graph after
unshallowing.
One way to fix this would be to reset the parsed object pool entirely
(flushing the cache and thus preventing subsequent reads from modifying
their parents) after unshallowing. That would produce a problem when
callers have a now-stale reference to the old pool, and so this patch
implements a different approach. Instead, attach a new bit to the pool,
'substituted_parent', which indicates if the repository *ever* stored a
commit which had its parents modified (i.e., the shallow boundary
prior to unshallowing).
This bit needs to be sticky because all reads subsequent to modifying a
commit's parents are unreliable when unshallowing. Modify the check in
'commit_graph_compatible' to take this bit into account, and correctly
avoid generating commit-graphs in this case, thus solving the bug.
Helped-by: Derrick Stolee <dstolee@microsoft.com>
Helped-by: Jonathan Nieder <jrnieder@gmail.com>
Reported-by: Jay Conrod <jayconrod@google.com>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-08 23:10:53 +02:00
|
|
|
/* the parsed object pool has grafts, or has its substituted_parent bit set */
if (r->parsed_objects &&
|
|
|
|
(r->parsed_objects->grafts_nr || r->parsed_objects->substituted_parent))
|
2018-08-20 20:24:30 +02:00
|
|
|
return 0;
|
|
|
|
/* shallow repository */
if (is_repository_shallow(r))
|
|
|
|
return 0;
|
|
|
|
|
2018-08-20 20:24:27 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2019-03-25 13:08:30 +01:00
|
|
|
/*
 * Open "graph_file" with git_open() and fstat() the resulting descriptor.
 *
 * The descriptor (or the negative value git_open() returned) is always
 * stored in *fd.  Returns 1 when both the open and the fstat() succeeded,
 * with *st filled in.  Returns 0 otherwise; if fstat() was the call that
 * failed, the descriptor has already been closed.
 */
int open_commit_graph(const char *graph_file, int *fd, struct stat *st)
{
	int opened = git_open(graph_file);

	*fd = opened;
	if (opened < 0)
		return 0;

	if (!fstat(opened, st))
		return 1;

	close(opened);
	return 0;
}
|
|
|
|
|
2020-09-09 17:22:56 +02:00
|
|
|
struct commit_graph *load_commit_graph_one_fd_st(struct repository *r,
|
|
|
|
int fd, struct stat *st,
|
2020-02-03 22:18:04 +01:00
|
|
|
struct object_directory *odb)
|
2018-04-10 14:56:02 +02:00
|
|
|
{
|
|
|
|
void *graph_map;
|
|
|
|
size_t graph_size;
|
2019-01-15 23:25:50 +01:00
|
|
|
struct commit_graph *ret;
|
2018-04-10 14:56:02 +02:00
|
|
|
|
2019-03-25 13:08:30 +01:00
|
|
|
graph_size = xsize_t(st->st_size);
|
2018-04-10 14:56:02 +02:00
|
|
|
|
|
|
|
if (graph_size < GRAPH_MIN_SIZE) {
|
|
|
|
close(fd);
|
2019-03-25 13:08:31 +01:00
|
|
|
error(_("commit-graph file is too small"));
|
2019-03-25 13:08:30 +01:00
|
|
|
return NULL;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
graph_map = xmmap(NULL, graph_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
2020-04-23 23:41:13 +02:00
|
|
|
close(fd);
|
2020-09-09 17:22:56 +02:00
|
|
|
ret = parse_commit_graph(r, graph_map, graph_size);
|
2019-01-15 23:25:50 +01:00
|
|
|
|
2020-02-03 22:18:04 +01:00
|
|
|
if (ret)
|
|
|
|
ret->odb = odb;
|
2020-04-23 23:41:13 +02:00
|
|
|
else
|
2019-01-15 23:25:50 +01:00
|
|
|
munmap(graph_map, graph_size);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
commit-graph: fix segfault on e.g. "git status"
When core.commitGraph=true is set, various common commands now consult
the commit graph. Because the commit-graph code is very trusting of
its input data, it's possible to construct a graph that'll cause an
immediate segfault on e.g. "status" (and e.g. "log", "blame", ...). In
some other cases git immediately exits with a cryptic error
about the graph being broken.
The root cause of this is that while the "commit-graph verify"
sub-command exhaustively verifies the graph, other users of the graph
simply trust the graph, and will e.g. dereference data found at certain
offsets as pointers, causing segfaults.
This change does the bare minimum to ensure that we don't segfault in
the common fill_commit_in_graph() codepath called by
e.g. setup_revisions(), to do this instrument the "commit-graph
verify" tests to always check if "status" would subsequently
segfault. This fixes the following tests which would previously
segfault:
not ok 50 - detect low chunk count
not ok 51 - detect missing OID fanout chunk
not ok 52 - detect missing OID lookup chunk
not ok 53 - detect missing commit data chunk
Those happened because with the commit-graph enabled setup_revisions()
would eventually call fill_commit_in_graph(), where e.g.
g->chunk_commit_data is used early as an offset (and will be
0x0). With this change we get far enough to detect that the graph is
broken, and show an error instead. E.g.:
$ git status; echo $?
error: commit-graph is missing the Commit Data chunk
1
That also sucks, we should *warn* and not hard-fail "status" just
because the commit-graph is corrupt, but fixing is left to a follow-up
change.
A side-effect of changing the reporting from graph_report() to error()
is that we now have an "error: " prefix for these even for
"commit-graph verify". Pseudo-diff before/after:
$ git commit-graph verify
-commit-graph is missing the Commit Data chunk
+error: commit-graph is missing the Commit Data chunk
Changing that is OK. Various errors it emits now early on are prefixed
with "error: ", moving these over and changing the output doesn't
break anything.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-03-25 13:08:29 +01:00
|
|
|
static int verify_commit_graph_lite(struct commit_graph *g)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Basic validation shared between parse_commit_graph()
|
|
|
|
* which'll be called every time the graph is used, and the
|
|
|
|
* much more expensive verify_commit_graph() used by
|
|
|
|
* "commit-graph verify".
|
|
|
|
*
|
|
|
|
* There should only be very basic checks here to ensure that
|
|
|
|
* we don't e.g. segfault in fill_commit_in_graph(), but
|
|
|
|
* because this is a very hot codepath nothing that e.g. loops
|
|
|
|
* over g->num_commits, or runs a checksum on the commit-graph
|
|
|
|
* itself.
|
|
|
|
*/
|
|
|
|
if (!g->chunk_oid_fanout) {
|
|
|
|
error("commit-graph is missing the OID Fanout chunk");
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (!g->chunk_oid_lookup) {
|
|
|
|
error("commit-graph is missing the OID Lookup chunk");
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (!g->chunk_commit_data) {
|
|
|
|
error("commit-graph is missing the Commit Data chunk");
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-09 17:22:56 +02:00
|
|
|
struct commit_graph *parse_commit_graph(struct repository *r,
|
|
|
|
void *graph_map, size_t graph_size)
|
2019-01-15 23:25:50 +01:00
|
|
|
{
|
|
|
|
const unsigned char *data, *chunk_lookup;
|
|
|
|
uint32_t i;
|
|
|
|
struct commit_graph *graph;
|
2020-06-05 15:00:30 +02:00
|
|
|
uint64_t next_chunk_offset;
|
2019-01-15 23:25:50 +01:00
|
|
|
uint32_t graph_signature;
|
|
|
|
unsigned char graph_version, hash_version;
|
|
|
|
|
|
|
|
if (!graph_map)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (graph_size < GRAPH_MIN_SIZE)
|
|
|
|
return NULL;
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
data = (const unsigned char *)graph_map;
|
|
|
|
|
|
|
|
graph_signature = get_be32(data);
|
|
|
|
if (graph_signature != GRAPH_SIGNATURE) {
|
2019-03-25 13:08:34 +01:00
|
|
|
error(_("commit-graph signature %X does not match signature %X"),
|
2018-04-10 14:56:02 +02:00
|
|
|
graph_signature, GRAPH_SIGNATURE);
|
2019-01-15 23:25:50 +01:00
|
|
|
return NULL;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
graph_version = *(unsigned char*)(data + 4);
|
|
|
|
if (graph_version != GRAPH_VERSION) {
|
2019-03-25 13:08:34 +01:00
|
|
|
error(_("commit-graph version %X does not match version %X"),
|
2018-04-10 14:56:02 +02:00
|
|
|
graph_version, GRAPH_VERSION);
|
2019-01-15 23:25:50 +01:00
|
|
|
return NULL;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
hash_version = *(unsigned char*)(data + 5);
|
2018-11-14 05:09:35 +01:00
|
|
|
if (hash_version != oid_version()) {
|
2019-03-25 13:08:34 +01:00
|
|
|
error(_("commit-graph hash version %X does not match version %X"),
|
2018-11-14 05:09:35 +01:00
|
|
|
hash_version, oid_version());
|
2019-01-15 23:25:50 +01:00
|
|
|
return NULL;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
2020-09-09 17:23:10 +02:00
|
|
|
prepare_repo_settings(r);
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
graph = alloc_commit_graph();
|
|
|
|
|
2018-11-14 05:09:35 +01:00
|
|
|
graph->hash_len = the_hash_algo->rawsz;
|
2018-04-10 14:56:02 +02:00
|
|
|
graph->num_chunks = *(unsigned char*)(data + 6);
|
|
|
|
graph->data = graph_map;
|
|
|
|
graph->data_len = graph_size;
|
|
|
|
|
commit-graph: simplify parse_commit_graph() #1
While we iterate over all entries of the Chunk Lookup table we make
sure that we don't attempt to read past the end of the mmap-ed
commit-graph file, and check in each iteration that the chunk ID and
offset we are about to read is still within the mmap-ed memory region.
However, these checks in each iteration are not really necessary,
because the number of chunks in the commit-graph file is already known
before this loop from the just parsed commit-graph header.
So let's check that the commit-graph file is large enough for all
entries in the Chunk Lookup table before we start iterating over those
entries, and drop those per-iteration checks. While at it, take into
account the size of everything that is necessary to have a valid
commit-graph file, i.e. the size of the header, the size of the
mandatory OID Fanout chunk, and the size of the signature in the
trailer as well.
Note that this necessitates the change of the error message as well,
and, consequently, have to update the 'detect incorrect chunk count'
test in 't5318-commit-graph.sh' as well.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-06-05 15:00:29 +02:00
|
|
|
if (graph_size < GRAPH_HEADER_SIZE +
|
|
|
|
(graph->num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH +
|
|
|
|
GRAPH_FANOUT_SIZE + the_hash_algo->rawsz) {
|
|
|
|
error(_("commit-graph file is too small to hold %u chunks"),
|
|
|
|
graph->num_chunks);
|
|
|
|
free(graph);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
chunk_lookup = data + 8;
|
2020-06-05 15:00:30 +02:00
|
|
|
next_chunk_offset = get_be64(chunk_lookup + 4);
|
2018-04-10 14:56:02 +02:00
|
|
|
for (i = 0; i < graph->num_chunks; i++) {
|
2019-01-15 23:25:51 +01:00
|
|
|
uint32_t chunk_id;
|
2020-06-05 15:00:30 +02:00
|
|
|
uint64_t chunk_offset = next_chunk_offset;
|
2018-04-10 14:56:02 +02:00
|
|
|
int chunk_repeated = 0;
|
|
|
|
|
2019-01-15 23:25:51 +01:00
|
|
|
chunk_id = get_be32(chunk_lookup + 0);
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
chunk_lookup += GRAPH_CHUNKLOOKUP_WIDTH;
|
2020-06-05 15:00:30 +02:00
|
|
|
next_chunk_offset = get_be64(chunk_lookup + 4);
|
2018-04-10 14:56:02 +02:00
|
|
|
|
2018-11-14 05:09:35 +01:00
|
|
|
if (chunk_offset > graph_size - the_hash_algo->rawsz) {
|
2019-03-25 13:08:34 +01:00
|
|
|
error(_("commit-graph improper chunk offset %08x%08x"), (uint32_t)(chunk_offset >> 32),
|
2018-04-10 14:56:02 +02:00
|
|
|
(uint32_t)chunk_offset);
|
2020-05-04 21:13:24 +02:00
|
|
|
goto free_and_return;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
switch (chunk_id) {
|
|
|
|
case GRAPH_CHUNKID_OIDFANOUT:
|
|
|
|
if (graph->chunk_oid_fanout)
|
|
|
|
chunk_repeated = 1;
|
|
|
|
else
|
|
|
|
graph->chunk_oid_fanout = (uint32_t*)(data + chunk_offset);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GRAPH_CHUNKID_OIDLOOKUP:
|
|
|
|
if (graph->chunk_oid_lookup)
|
|
|
|
chunk_repeated = 1;
|
2020-06-05 15:00:30 +02:00
|
|
|
else {
|
2018-04-10 14:56:02 +02:00
|
|
|
graph->chunk_oid_lookup = data + chunk_offset;
|
2020-06-05 15:00:30 +02:00
|
|
|
graph->num_commits = (next_chunk_offset - chunk_offset)
|
|
|
|
/ graph->hash_len;
|
|
|
|
}
|
2018-04-10 14:56:02 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case GRAPH_CHUNKID_DATA:
|
|
|
|
if (graph->chunk_commit_data)
|
|
|
|
chunk_repeated = 1;
|
|
|
|
else
|
|
|
|
graph->chunk_commit_data = data + chunk_offset;
|
|
|
|
break;
|
|
|
|
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 21:21:13 +01:00
|
|
|
case GRAPH_CHUNKID_EXTRAEDGES:
|
|
|
|
if (graph->chunk_extra_edges)
|
2018-04-10 14:56:02 +02:00
|
|
|
chunk_repeated = 1;
|
|
|
|
else
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 21:21:13 +01:00
|
|
|
graph->chunk_extra_edges = data + chunk_offset;
|
2018-04-10 14:56:02 +02:00
|
|
|
break;
|
2019-06-18 20:14:26 +02:00
|
|
|
|
|
|
|
case GRAPH_CHUNKID_BASE:
|
|
|
|
if (graph->chunk_base_graphs)
|
|
|
|
chunk_repeated = 1;
|
|
|
|
else
|
|
|
|
graph->chunk_base_graphs = data + chunk_offset;
|
2020-04-06 18:59:49 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case GRAPH_CHUNKID_BLOOMINDEXES:
|
|
|
|
if (graph->chunk_bloom_indexes)
|
|
|
|
chunk_repeated = 1;
|
2020-09-09 17:23:10 +02:00
|
|
|
else if (r->settings.commit_graph_read_changed_paths)
|
2020-04-06 18:59:49 +02:00
|
|
|
graph->chunk_bloom_indexes = data + chunk_offset;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GRAPH_CHUNKID_BLOOMDATA:
|
|
|
|
if (graph->chunk_bloom_data)
|
|
|
|
chunk_repeated = 1;
|
2020-09-09 17:23:10 +02:00
|
|
|
else if (r->settings.commit_graph_read_changed_paths) {
|
2020-04-06 18:59:49 +02:00
|
|
|
uint32_t hash_version;
|
|
|
|
graph->chunk_bloom_data = data + chunk_offset;
|
|
|
|
hash_version = get_be32(data + chunk_offset);
|
|
|
|
|
|
|
|
if (hash_version != 1)
|
|
|
|
break;
|
|
|
|
|
|
|
|
graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
|
|
|
|
graph->bloom_filter_settings->hash_version = hash_version;
|
|
|
|
graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
|
|
|
|
graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
|
2020-09-17 15:34:42 +02:00
|
|
|
graph->bloom_filter_settings->max_changed_paths = DEFAULT_BLOOM_MAX_CHANGES;
|
2020-04-06 18:59:49 +02:00
|
|
|
}
|
|
|
|
break;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (chunk_repeated) {
|
2019-03-25 13:08:34 +01:00
|
|
|
error(_("commit-graph chunk id %08x appears multiple times"), chunk_id);
|
2020-05-04 21:13:24 +02:00
|
|
|
goto free_and_return;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-06 18:59:49 +02:00
|
|
|
if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
|
|
|
|
init_bloom_filters();
|
|
|
|
} else {
|
|
|
|
/* We need both the bloom chunks to exist together. Else ignore the data */
|
|
|
|
graph->chunk_bloom_indexes = NULL;
|
|
|
|
graph->chunk_bloom_data = NULL;
|
2020-05-04 21:13:24 +02:00
|
|
|
FREE_AND_NULL(graph->bloom_filter_settings);
|
2020-04-06 18:59:49 +02:00
|
|
|
}
|
|
|
|
|
2019-06-18 20:14:26 +02:00
|
|
|
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
|
|
|
|
|
2020-05-04 21:13:24 +02:00
|
|
|
if (verify_commit_graph_lite(graph))
|
|
|
|
goto free_and_return;
|
commit-graph: fix segfault on e.g. "git status"
When core.commitGraph=true is set, various common commands now consult
the commit graph. Because the commit-graph code is very trusting of
its input data, it's possibly to construct a graph that'll cause an
immediate segfault on e.g. "status" (and e.g. "log", "blame", ...). In
some other cases where git immediately exits with a cryptic error
about the graph being broken.
The root cause of this is that while the "commit-graph verify"
sub-command exhaustively verifies the graph, other users of the graph
simply trust the graph, and will e.g. deference data found at certain
offsets as pointers, causing segfaults.
This change does the bare minimum to ensure that we don't segfault in
the common fill_commit_in_graph() codepath called by
e.g. setup_revisions(), to do this instrument the "commit-graph
verify" tests to always check if "status" would subsequently
segfault. This fixes the following tests which would previously
segfault:
not ok 50 - detect low chunk count
not ok 51 - detect missing OID fanout chunk
not ok 52 - detect missing OID lookup chunk
not ok 53 - detect missing commit data chunk
Those happened because with the commit-graph enabled setup_revisions()
would eventually call fill_commit_in_graph(), where e.g.
g->chunk_commit_data is used early as an offset (and will be
0x0). With this change we get far enough to detect that the graph is
broken, and show an error instead. E.g.:
$ git status; echo $?
error: commit-graph is missing the Commit Data chunk
1
That also sucks, we should *warn* and not hard-fail "status" just
because the commit-graph is corrupt, but fixing is left to a follow-up
change.
A side-effect of changing the reporting from graph_report() to error()
is that we now have an "error: " prefix for these even for
"commit-graph verify". Pseudo-diff before/after:
$ git commit-graph verify
-commit-graph is missing the Commit Data chunk
+error: commit-graph is missing the Commit Data chunk
Changing that is OK. Various errors it emits now early on are prefixed
with "error: ", moving these over and changing the output doesn't
break anything.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-03-25 13:08:29 +01:00
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
return graph;
|
2020-05-04 21:13:24 +02:00
|
|
|
|
|
|
|
free_and_return:
|
|
|
|
free(graph->bloom_filter_settings);
|
|
|
|
free(graph);
|
|
|
|
return NULL;
|
2018-04-10 14:56:02 +02:00
|
|
|
}
|
|
|
|
|
2020-09-09 17:22:56 +02:00
|
|
|
static struct commit_graph *load_commit_graph_one(struct repository *r,
|
|
|
|
const char *graph_file,
|
2020-02-03 22:18:04 +01:00
|
|
|
struct object_directory *odb)
|
2019-03-25 13:08:30 +01:00
|
|
|
{
|
|
|
|
|
|
|
|
struct stat st;
|
|
|
|
int fd;
|
2019-06-18 20:14:27 +02:00
|
|
|
struct commit_graph *g;
|
2019-03-25 13:08:30 +01:00
|
|
|
int open_ok = open_commit_graph(graph_file, &fd, &st);
|
|
|
|
|
|
|
|
if (!open_ok)
|
|
|
|
return NULL;
|
|
|
|
|
2020-09-09 17:22:56 +02:00
|
|
|
g = load_commit_graph_one_fd_st(r, fd, &st, odb);
|
2019-06-18 20:14:27 +02:00
|
|
|
|
|
|
|
if (g)
|
|
|
|
g->filename = xstrdup(graph_file);
|
|
|
|
|
|
|
|
return g;
|
2019-03-25 13:08:30 +01:00
|
|
|
}
|
|
|
|
|
2020-02-03 22:18:00 +01:00
|
|
|
/*
 * Load the commit-graph file whose name get_commit_graph_filename()
 * produces for odb. Returns whatever load_commit_graph_one() returns;
 * the temporary file name is freed before returning.
 */
static struct commit_graph *load_commit_graph_v1(struct repository *r,
						 struct object_directory *odb)
{
	char *path = get_commit_graph_filename(odb);
	struct commit_graph *result;

	result = load_commit_graph_one(r, path, odb);
	free(path);

	return result;
}
|
|
|
|
|
|
|
|
static int add_graph_to_chain(struct commit_graph *g,
|
|
|
|
struct commit_graph *chain,
|
|
|
|
struct object_id *oids,
|
|
|
|
int n)
|
|
|
|
{
|
|
|
|
struct commit_graph *cur_g = chain;
|
|
|
|
|
2019-06-18 20:14:26 +02:00
|
|
|
if (n && !g->chunk_base_graphs) {
|
|
|
|
warning(_("commit-graph has no base graphs chunk"));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-06-18 20:14:25 +02:00
|
|
|
while (n) {
|
|
|
|
n--;
|
2019-06-18 20:14:26 +02:00
|
|
|
|
|
|
|
if (!cur_g ||
|
|
|
|
!oideq(&oids[n], &cur_g->oid) ||
|
|
|
|
!hasheq(oids[n].hash, g->chunk_base_graphs + g->hash_len * n)) {
|
|
|
|
warning(_("commit-graph chain does not match"));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-06-18 20:14:25 +02:00
|
|
|
cur_g = cur_g->base_graph;
|
|
|
|
}
|
|
|
|
|
|
|
|
g->base_graph = chain;
|
|
|
|
|
|
|
|
if (chain)
|
|
|
|
g->num_commits_in_base = chain->num_commits + chain->num_commits_in_base;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2020-02-03 22:18:00 +01:00
|
|
|
static struct commit_graph *load_commit_graph_chain(struct repository *r,
|
|
|
|
struct object_directory *odb)
|
2019-06-18 20:14:25 +02:00
|
|
|
{
|
|
|
|
struct commit_graph *graph_chain = NULL;
|
|
|
|
struct strbuf line = STRBUF_INIT;
|
|
|
|
struct stat st;
|
|
|
|
struct object_id *oids;
|
|
|
|
int i = 0, valid = 1, count;
|
2020-09-17 20:11:46 +02:00
|
|
|
char *chain_name = get_commit_graph_chain_filename(odb);
|
2019-06-18 20:14:25 +02:00
|
|
|
FILE *fp;
|
|
|
|
int stat_res;
|
|
|
|
|
|
|
|
fp = fopen(chain_name, "r");
|
|
|
|
stat_res = stat(chain_name, &st);
|
|
|
|
free(chain_name);
|
|
|
|
|
|
|
|
if (!fp ||
|
|
|
|
stat_res ||
|
|
|
|
st.st_size <= the_hash_algo->hexsz)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
count = st.st_size / (the_hash_algo->hexsz + 1);
|
|
|
|
oids = xcalloc(count, sizeof(struct object_id));
|
|
|
|
|
2019-06-18 20:14:30 +02:00
|
|
|
prepare_alt_odb(r);
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++) {
|
|
|
|
struct object_directory *odb;
|
2019-06-18 20:14:25 +02:00
|
|
|
|
|
|
|
if (strbuf_getline_lf(&line, fp) == EOF)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (get_oid_hex(line.buf, &oids[i])) {
|
|
|
|
warning(_("invalid commit-graph chain: line '%s' not a hash"),
|
|
|
|
line.buf);
|
|
|
|
valid = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-06-18 20:14:30 +02:00
|
|
|
valid = 0;
|
|
|
|
for (odb = r->objects->odb; odb; odb = odb->next) {
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb' To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports that the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to but above of the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-03 22:18:02 +01:00
|
|
|
char *graph_name = get_split_graph_filename(odb, line.buf);
|
2020-09-09 17:22:56 +02:00
|
|
|
struct commit_graph *g = load_commit_graph_one(r, graph_name, odb);
|
2019-06-18 20:14:25 +02:00
|
|
|
|
2019-06-18 20:14:30 +02:00
|
|
|
free(graph_name);
|
|
|
|
|
|
|
|
if (g) {
|
|
|
|
if (add_graph_to_chain(g, graph_chain, oids, i)) {
|
|
|
|
graph_chain = g;
|
|
|
|
valid = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!valid) {
|
|
|
|
warning(_("unable to find all commit-graph files"));
|
|
|
|
break;
|
|
|
|
}
|
2019-06-18 20:14:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
free(oids);
|
|
|
|
fclose(fp);
|
2019-08-07 13:15:02 +02:00
|
|
|
strbuf_release(&line);
|
2019-06-18 20:14:25 +02:00
|
|
|
|
|
|
|
return graph_chain;
|
|
|
|
}
|
|
|
|
|
|