2006-08-05 08:04:21 +02:00
|
|
|
#include "builtin.h"
|
|
|
|
#include "cache.h"
|
2018-03-23 18:20:59 +01:00
|
|
|
#include "repository.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2014-10-01 12:28:42 +02:00
|
|
|
#include "lockfile.h"
|
2006-08-05 08:04:21 +02:00
|
|
|
#include "object.h"
|
|
|
|
#include "blob.h"
|
2006-08-14 06:58:19 +02:00
|
|
|
#include "tree.h"
|
2007-02-06 22:08:06 +01:00
|
|
|
#include "commit.h"
|
2006-08-05 08:04:21 +02:00
|
|
|
#include "delta.h"
|
|
|
|
#include "pack.h"
|
2006-08-14 06:58:19 +02:00
|
|
|
#include "refs.h"
|
2006-08-05 08:04:21 +02:00
|
|
|
#include "csum-file.h"
|
2006-08-15 02:16:28 +02:00
|
|
|
#include "quote.h"
|
2010-10-03 11:56:46 +02:00
|
|
|
#include "dir.h"
|
2016-04-25 23:17:28 +02:00
|
|
|
#include "run-command.h"
|
2017-08-19 00:20:16 +02:00
|
|
|
#include "packfile.h"
|
2018-03-23 18:20:59 +01:00
|
|
|
#include "object-store.h"
|
2018-04-11 20:37:55 +02:00
|
|
|
#include "mem-pool.h"
|
2018-07-20 18:33:04 +02:00
|
|
|
#include "commit-reach.h"
|
fast-import: add options for rewriting submodules
When converting a repository using submodules from one hash algorithm to
another, it is necessary to rewrite the submodules from the old
algorithm to the new algorithm, since only references to submodules, not
their contents, are written to the fast-export stream. Without rewriting
the submodules, fast-import fails with an "Invalid dataref" error when
encountering a submodule in another algorithm.
Add a pair of options, --rewrite-submodules-from and
--rewrite-submodules-to, that take a list of marks produced by
fast-export and fast-import, respectively, when processing the
submodule. Use these marks to map the submodule commits from the old
algorithm to the new algorithm.
We read marks into two corresponding struct mark_set objects and then
perform a mapping from the old to the new using a hash table. This lets
us reuse the same mark parsing code that is used elsewhere and allows us
to efficiently read and match marks based on their ID, since mark files
need not be sorted.
Note that because we're using a khash table for the object IDs, and this
table copies values of struct object_id instead of taking references to
them, it's necessary to zero the struct object_id values that we use to
insert and look up in the table. Otherwise, we would end up with SHA-1
values that don't match because of whatever stack garbage might be left
in the unused area.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-22 21:17:49 +01:00
|
|
|
#include "khash.h"
|
date API: create a date.h, split from cache.h
Move the declaration of the date.c functions from cache.h, and adjust
the relevant users to include the new date.h header.
The show_ident_date() function belonged in pretty.h (it's defined in
pretty.c), its two users outside of pretty.c didn't strictly need to
include pretty.h, as they get it indirectly, but let's add it to them
anyway.
Similarly, the change to "builtin/{fast-import,show-branch,tag}.c"
isn't needed as far as the compiler is concerned, but since they all
use the "DATE_MODE()" macro we now define in date.h, let's have them
include it.
We could simply include this new header in "cache.h", but as this
change shows these functions weren't common enough to warrant
including them in it in the first place. By moving them out of cache.h
changes to this API will no longer cause a (mostly) full re-build of
the project when "make" is run.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 09:14:02 +01:00
|
|
|
#include "date.h"
|
2006-08-05 08:04:21 +02:00
|
|
|
|
2007-01-17 08:42:43 +01:00
|
|
|
/*
 * Bit widths used for the "pack_id" and "depth" bit-fields, together with
 * the largest value that fits in a field of each width.
 */
#define PACK_ID_BITS 16
#define MAX_PACK_ID ((1 << PACK_ID_BITS) - 1)
#define DEPTH_BITS 13
#define MAX_DEPTH ((1 << DEPTH_BITS) - 1)
|
2007-01-17 08:42:43 +01:00
|
|
|
|
2011-08-14 20:32:24 +02:00
|
|
|
/*
 * The setuid mode bit is repurposed on directories as a marker meaning
 * "do not delta".
 */
#define NO_DELTA S_ISUID
|
|
|
|
|
2019-02-19 01:05:05 +01:00
|
|
|
/*
 * Additional space that must be available before an object can be written
 * into the current pack: the hash lengths at the end of the pack, plus the
 * length of one object ID.
 */
#define PACK_SIZE_THRESHOLD (3 * the_hash_algo->rawsz)
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct object_entry {
|
2010-02-17 20:05:51 +01:00
|
|
|
struct pack_idx_entry idx;
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
struct hashmap_entry ent;
|
2007-11-14 05:48:42 +01:00
|
|
|
uint32_t type : TYPE_BITS,
|
|
|
|
pack_id : PACK_ID_BITS,
|
|
|
|
depth : DEPTH_BITS;
|
2006-08-08 06:03:59 +02:00
|
|
|
};
|
|
|
|
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
static int object_entry_hashcmp(const void *map_data,
|
|
|
|
const struct hashmap_entry *eptr,
|
|
|
|
const struct hashmap_entry *entry_or_key,
|
|
|
|
const void *keydata)
|
|
|
|
{
|
|
|
|
const struct object_id *oid = keydata;
|
|
|
|
const struct object_entry *e1, *e2;
|
|
|
|
|
|
|
|
e1 = container_of(eptr, const struct object_entry, ent);
|
|
|
|
if (oid)
|
|
|
|
return oidcmp(&e1->idx.oid, oid);
|
|
|
|
|
|
|
|
e2 = container_of(entry_or_key, const struct object_entry, ent);
|
|
|
|
return oidcmp(&e1->idx.oid, &e2->idx.oid);
|
|
|
|
}
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct object_entry_pool {
|
2006-08-14 06:58:19 +02:00
|
|
|
struct object_entry_pool *next_pool;
|
2006-08-08 06:03:59 +02:00
|
|
|
struct object_entry *next_free;
|
|
|
|
struct object_entry *end;
|
2006-08-08 06:46:13 +02:00
|
|
|
struct object_entry entries[FLEX_ARRAY]; /* more */
|
2006-08-08 06:03:59 +02:00
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
/*
 * One fixed-size (1024 slot) node of the structure that stores marks.
 * The three union members are alternative views of the same slot array:
 * object IDs, object entries, or nested mark_set nodes.
 * NOTE(review): "shift" presumably records how far a mark id is shifted
 * to index this level -- confirm against the insert/find helpers.
 */
struct mark_set {
	union {
		struct object_id *oids[1024];
		struct object_entry *marked[1024];
		struct mark_set *sets[1024];
	} data;
	unsigned int shift;
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct last_object {
|
2007-09-17 14:00:38 +02:00
|
|
|
struct strbuf data;
|
2010-02-17 20:05:54 +01:00
|
|
|
off_t offset;
|
2006-08-08 09:36:45 +02:00
|
|
|
unsigned int depth;
|
2007-09-17 14:00:38 +02:00
|
|
|
unsigned no_swap : 1;
|
2006-08-08 06:46:13 +02:00
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct atom_str {
|
2006-08-14 06:58:19 +02:00
|
|
|
struct atom_str *next_atom;
|
2007-02-05 22:34:56 +01:00
|
|
|
unsigned short str_len;
|
2006-08-14 06:58:19 +02:00
|
|
|
char str_dat[FLEX_ARRAY]; /* more */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct tree_content;
|
2011-03-16 08:08:34 +01:00
|
|
|
struct tree_entry {
|
2006-08-14 06:58:19 +02:00
|
|
|
struct tree_content *tree;
|
2009-05-01 11:06:36 +02:00
|
|
|
struct atom_str *name;
|
2011-03-16 08:08:34 +01:00
|
|
|
struct tree_entry_ms {
|
2007-02-05 22:34:56 +01:00
|
|
|
uint16_t mode;
|
2017-05-01 04:29:03 +02:00
|
|
|
struct object_id oid;
|
2006-08-28 18:22:50 +02:00
|
|
|
} versions[2];
|
2006-08-08 09:36:45 +02:00
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct tree_content {
|
2006-08-14 06:58:19 +02:00
|
|
|
unsigned int entry_capacity; /* must match avail_tree_content */
|
|
|
|
unsigned int entry_count;
|
2006-08-28 18:22:50 +02:00
|
|
|
unsigned int delta_depth;
|
2006-08-14 06:58:19 +02:00
|
|
|
struct tree_entry *entries[FLEX_ARRAY]; /* more */
|
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
/*
 * List node whose leading entry_capacity member mirrors the one in
 * struct tree_content; nodes are chained through next_avail.
 */
struct avail_tree_content {
	unsigned int entry_capacity; /* must match tree_content */
	struct avail_tree_content *next_avail;
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct branch {
|
2006-08-14 06:58:19 +02:00
|
|
|
struct branch *table_next_branch;
|
|
|
|
struct branch *active_next_branch;
|
2006-08-08 09:36:45 +02:00
|
|
|
const char *name;
|
2006-08-14 06:58:19 +02:00
|
|
|
struct tree_entry branch_tree;
|
2007-01-17 08:42:43 +01:00
|
|
|
uintmax_t last_commit;
|
2009-12-07 12:27:24 +01:00
|
|
|
uintmax_t num_notes;
|
2007-03-05 18:31:09 +01:00
|
|
|
unsigned active : 1;
|
2014-04-20 20:59:27 +02:00
|
|
|
unsigned delete : 1;
|
2007-03-05 18:31:09 +01:00
|
|
|
unsigned pack_id : PACK_ID_BITS;
|
2017-05-01 04:29:03 +02:00
|
|
|
struct object_id oid;
|
2006-08-08 09:36:45 +02:00
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct tag {
|
2006-08-24 09:12:13 +02:00
|
|
|
struct tag *next_tag;
|
|
|
|
const char *name;
|
2007-01-16 22:18:44 +01:00
|
|
|
unsigned int pack_id;
|
2017-05-01 04:29:03 +02:00
|
|
|
struct object_id oid;
|
2006-08-24 09:12:13 +02:00
|
|
|
};
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
struct hash_list {
|
2007-01-12 04:21:38 +01:00
|
|
|
struct hash_list *next;
|
2017-05-01 04:29:03 +02:00
|
|
|
struct object_id oid;
|
2007-01-12 04:21:38 +01:00
|
|
|
};
|
2006-08-14 06:58:19 +02:00
|
|
|
|
2007-02-06 20:58:30 +01:00
|
|
|
/*
 * Selector for how a "when" specification is interpreted.  Numbering
 * starts at 1, so a zero value matches none of the enumerators.
 */
typedef enum {
	WHENSPEC_RAW = 1,
	WHENSPEC_RAW_PERMISSIVE = 2,
	WHENSPEC_RFC2822 = 3,
	WHENSPEC_NOW = 4
} whenspec_type;
|
|
|
|
|
2011-03-16 08:08:34 +01:00
|
|
|
/* Doubly linked list node holding the text of one command in "buf". */
struct recent_command {
	struct recent_command *prev, *next;
	char *buf;
};
|
|
|
|
|
fast-import: fix over-allocation of marks storage
Fast-import stores its marks in a trie-like structure made of mark_set
structs. Each struct has a fixed size (1024). If our id number is too
large to fit in the struct, then we allocate a new struct which shifts
the id number by 10 bits. Our original struct becomes a child node
of this new layer, and the new struct becomes the top level of the trie.
This scheme was broken by ddddf8d7e2 (fast-import: permit reading
multiple marks files, 2020-02-22). Before then, we had a top-level
"marks" pointer, and the push-down worked by assigning the new top-level
struct to "marks". But after that commit, insert_mark() takes a pointer
to the mark_set, rather than using the global "marks". It continued to
assign to the global "marks" variable during the push down, which was
wrong for two reasons:
- we added a call in option_rewrite_submodules() which uses a separate
mark set; pushing down on "marks" is outright wrong here. We'd
corrupt the "marks" set, and we'd fail to correctly store any
submodule mappings with an id over 1024.
- the other callers passed "marks", but the push-down was still wrong.
In read_mark_file(), we take the pointer to the mark_set as a
parameter. So even though insert_mark() was updating the global
"marks", the local pointer we had in read_mark_file() was not
updated. As a result, we'd add a new level when needed, but then the
next call to insert_mark() wouldn't see it! It would then allocate a
new layer, which would also not be seen, and so on. Lookups for the
lost layers obviously wouldn't work, but before we even hit any
lookup stage, we'd generally run out of memory and die.
Our tests didn't notice either of these cases because they didn't have
enough marks to trigger the push-down behavior. The new tests in t9304
cover both cases (and fail without this patch).
We can solve the problem by having insert_mark() take a pointer-to-pointer
of the top-level of the set. Then our push down can assign to it in a
way that the caller actually sees. Note the subtle reordering in
option_rewrite_submodules(). Our call to read_mark_file() may modify our
top-level set pointer, so we have to wait until after it returns to
assign its value into the string_list.
Reported-by: Sergey Brester <serg.brester@sebres.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-10-15 17:38:49 +02:00
|
|
|
/*
 * Callback that stores "oid" under "mark".  The set is passed by address
 * (pointer to pointer), so the callee is able to replace the caller's
 * top-level set pointer.
 */
typedef void (*mark_set_inserter_t)(struct mark_set **s,
				    struct object_id *oid,
				    uintmax_t mark);

/* Callback taking a mark id, an opaque object and opaque caller data. */
typedef void (*each_mark_fn_t)(uintmax_t mark, void *obj, void *cbp);
|
2020-02-22 21:17:46 +01:00
|
|
|
|
2007-01-16 06:33:19 +01:00
|
|
|
/* Configured limits on output */
static unsigned long max_depth = 50;
static off_t max_packsize;	/* zero-initialized */
static int unpack_limit = 100;
static int force_update;	/* zero-initialized */
|
2007-01-16 06:33:19 +01:00
|
|
|
|
|
|
|
/* Stats and misc. counters */
|
|
|
|
static uintmax_t alloc_count;
|
|
|
|
static uintmax_t marks_set_count;
|
|
|
|
static uintmax_t object_count_by_type[1 << TYPE_BITS];
|
|
|
|
static uintmax_t duplicate_count_by_type[1 << TYPE_BITS];
|
|
|
|
static uintmax_t delta_count_by_type[1 << TYPE_BITS];
|
2011-08-20 21:04:11 +02:00
|
|
|
static uintmax_t delta_count_attempts_by_type[1 << TYPE_BITS];
|
2007-01-16 10:55:41 +01:00
|
|
|
static unsigned long object_count;
|
2006-08-08 09:36:45 +02:00
|
|
|
static unsigned long branch_count;
|
2006-08-23 10:31:12 +02:00
|
|
|
static unsigned long branch_load_count;
|
2007-02-06 22:08:06 +01:00
|
|
|
static int failure;
|
2007-02-12 01:45:56 +01:00
|
|
|
static FILE *pack_edges;
|
2009-12-04 18:06:54 +01:00
|
|
|
static unsigned int show_stats = 1;
|
2009-12-04 18:06:57 +01:00
|
|
|
static int global_argc;
|
add an extra level of indirection to main()
There are certain startup tasks that we expect every git
process to do. In some cases this is just to improve the
quality of the program (e.g., setting up gettext()). In
others it is a requirement for using certain functions in
libgit.a (e.g., system_path() expects that you have called
git_extract_argv0_path()).
Most commands are builtins and are covered by the git.c
version of main(). However, there are still a few external
commands that use their own main(). Each of these has to
remember to include the correct startup sequence, and we are
not always consistent.
Rather than just fix the inconsistencies, let's make this
harder to get wrong by providing a common main() that can
run this standard startup.
We basically have two options to do this:
- the compat/mingw.h file already does something like this by
adding a #define that replaces the definition of main with a
wrapper that calls mingw_startup().
The upside is that the code in each program doesn't need
to be changed at all; it's rewritten on the fly by the
preprocessor.
The downside is that it may make debugging of the startup
sequence a bit more confusing, as the preprocessor is
quietly inserting new code.
- the builtin functions are all of the form cmd_foo(),
and git.c's main() calls them.
This is much more explicit, which may make things more
obvious to somebody reading the code. It's also more
flexible (because of course we have to figure out _which_
cmd_foo() to call).
The downside is that each of the builtins must define
cmd_foo(), instead of just main().
This patch chooses the latter option, preferring the more
explicit approach, even though it is more invasive. We
introduce a new file common-main.c, with the "real" main. It
expects to call cmd_main() from whatever other objects it is
linked against.
We link common-main.o against anything that links against
libgit.a, since we know that such programs will need to do
this setup. Note that common-main.o can't actually go inside
libgit.a, as the linker would not pick up its main()
function automatically (it has no callers).
The rest of the patch is just adjusting all of the various
external programs (mostly in t/helper) to use cmd_main().
I've provided a global declaration for cmd_main(), which
means that all of the programs also need to match its
signature. In particular, many functions need to switch to
"const char **" instead of "char **" for argv. This effect
ripples out to a few other variables and functions, as well.
This makes the patch even more invasive, but the end result
is much better. We should be treating argv strings as const
anyway, and now all programs conform to the same signature
(which also matches the way builtins are defined).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-01 07:58:58 +02:00
|
|
|
/* NOTE(review): presumably the saved argument vector paired with global_argc -- confirm. */
static const char **global_argv;
|
2006-08-08 06:46:13 +02:00
|
|
|
|
2006-08-14 06:58:19 +02:00
|
|
|
/* Memory pools */
|
2022-02-24 10:33:07 +01:00
|
|
|
static struct mem_pool fi_mem_pool = {
|
|
|
|
.block_alloc = 2*1024*1024 - sizeof(struct mp_block),
|
|
|
|
};
|
2006-08-14 06:58:19 +02:00
|
|
|
|
2006-08-15 02:16:28 +02:00
|
|
|
/* Atom management */
static unsigned int atom_table_sz = 4451;
static unsigned int atom_cnt;
static struct atom_str **atom_table;	/* NULL at startup */
|
|
|
|
|
|
|
|
/* The .pack file being generated */
|
2011-02-26 00:43:25 +01:00
|
|
|
static struct pack_idx_option pack_idx_opts;
|
2007-01-15 12:35:41 +01:00
|
|
|
static unsigned int pack_id;
|
2018-02-01 03:18:46 +01:00
|
|
|
static struct hashfile *pack_file;
|
2007-01-14 12:20:23 +01:00
|
|
|
static struct packed_git *pack_data;
|
2007-01-15 12:35:41 +01:00
|
|
|
static struct packed_git **all_packs;
|
2010-02-17 20:05:54 +01:00
|
|
|
static off_t pack_size;
|
2006-08-08 06:46:13 +02:00
|
|
|
|
|
|
|
/* Table of objects we've written. */
static unsigned int object_entry_alloc = 5000;
static struct object_entry_pool *blocks;	/* NULL at startup */
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
/* hashmap of object entries */
static struct hashmap object_table;

/* Top-level mark set, plus settings for exporting/importing mark files. */
static struct mark_set *marks;
static const char *export_marks_file;
static const char *import_marks_file;
static int import_marks_file_from_stream;
static int import_marks_file_ignore_missing;
static int import_marks_file_done;
static int relative_marks_paths;
|
2006-08-08 06:46:13 +02:00
|
|
|
|
|
|
|
/* Our last blob */
|
2022-02-24 10:33:07 +01:00
|
|
|
static struct last_object last_blob = {
|
|
|
|
.data = STRBUF_INIT,
|
|
|
|
};
|
2006-08-14 06:58:19 +02:00
|
|
|
|
|
|
|
/* Tree management */
|
|
|
|
static unsigned int tree_entry_alloc = 1000;
|
|
|
|
static void *avail_tree_entry;
|
|
|
|
static unsigned int avail_tree_table_sz = 100;
|
|
|
|
static struct avail_tree_content **avail_tree_table;
|
2018-04-11 20:37:54 +02:00
|
|
|
static size_t tree_entry_allocd;
|
2007-09-17 13:48:17 +02:00
|
|
|
static struct strbuf old_tree = STRBUF_INIT;
|
|
|
|
static struct strbuf new_tree = STRBUF_INIT;
|
2006-08-06 19:51:39 +02:00
|
|
|
|
2006-08-08 09:36:45 +02:00
|
|
|
/* Branch data */
static unsigned long max_active_branches = 5;
static unsigned long cur_active_branches;
static unsigned long branch_table_sz = 1039;
static struct branch **branch_table;	/* NULL at startup */
static struct branch *active_branches;	/* NULL at startup */
|
|
|
|
|
2006-08-24 09:12:13 +02:00
|
|
|
/* Tag data: both ends of the tag list, NULL at startup. */
static struct tag *first_tag, *last_tag;
|
|
|
|
|
2006-08-15 02:16:28 +02:00
|
|
|
/* Input stream parsing */
|
2007-02-06 20:58:30 +01:00
|
|
|
static whenspec_type whenspec = WHENSPEC_RAW;
|
2007-09-06 13:20:07 +02:00
|
|
|
static struct strbuf command_buf = STRBUF_INIT;
|
2007-08-01 08:22:53 +02:00
|
|
|
static int unread_command_buf;
|
2022-02-24 10:33:07 +01:00
|
|
|
static struct recent_command cmd_hist = {
|
|
|
|
.prev = &cmd_hist,
|
|
|
|
.next = &cmd_hist,
|
|
|
|
};
|
2007-08-03 10:47:04 +02:00
|
|
|
static struct recent_command *cmd_tail = &cmd_hist;
|
|
|
|
static struct recent_command *rc_free;
|
|
|
|
static unsigned int cmd_save = 100;
|
2007-01-16 06:33:19 +01:00
|
|
|
static uintmax_t next_mark;
|
2007-09-17 13:48:17 +02:00
|
|
|
static struct strbuf new_data = STRBUF_INIT;
|
2009-12-04 18:06:56 +01:00
|
|
|
static int seen_data_command;
|
2011-07-16 15:03:32 +02:00
|
|
|
static int require_explicit_termination;
|
fast-import: disallow "feature export-marks" by default
The fast-import stream command "feature export-marks=<path>" lets the
stream write marks to an arbitrary path. This may be surprising if you
are running fast-import against an untrusted input (which otherwise
cannot do anything except update Git objects and refs).
Let's disallow the use of this feature by default, and provide a
command-line option to re-enable it (you can always just use the
command-line --export-marks as well, but the in-stream version provides
an easy way for exporters to control the process).
This is a backwards-incompatible change, since the default is flipping
to the new, safer behavior. However, since the main users of the
in-stream versions would be import/export-based remote helpers, and
since we trust remote helpers already (which are already running
arbitrary code), we'll pass the new option by default when reading a
remote helper's stream. This should minimize the impact.
Note that the implementation isn't totally simple, as we have to work
around the fact that fast-import doesn't parse its command-line options
until after it has read any "feature" lines from the stream. This is how
it lets command-line options override in-stream. But in our case, it's
important to parse the new --allow-unsafe-features first.
There are three options for resolving this:
1. Do a separate "early" pass over the options. This is easy for us to
do because there are no command-line options that allow the
"unstuck" form (so there's no chance of us mistaking an argument
for an option), though it does introduce a risk of incorrect
parsing later (e.g., if we convert to parse-options).
2. Move the option parsing phase back to the start of the program, but
teach the stream-reading code never to override an existing value.
This is tricky, because stream "feature" lines override each other
(meaning we'd have to start tracking the source for every option).
3. Accept that we might parse a "feature export-marks" line that is
forbidden, as long as we don't _act_ on it until after we've parsed
the command line options.
This would, in fact, work with the current code, but only because
the previous patch fixed the export-marks parser to avoid touching
the filesystem.
So while it works, it does carry risk of somebody getting it wrong
in the future in a rather subtle and unsafe way.
I've gone with option (1) here as simple, safe, and unlikely to cause
regressions.
This fixes CVE-2019-1348.
Signed-off-by: Jeff King <peff@peff.net>
2019-08-29 20:37:26 +02:00
|
|
|
static int allow_unsafe_features;
|
2006-08-15 02:16:28 +02:00
|
|
|
|
2010-11-22 09:16:02 +01:00
|
|
|
/* Signal handling */
|
|
|
|
static volatile sig_atomic_t checkpoint_requested;
|
|
|
|
|
fast-import: add options for rewriting submodules
When converting a repository using submodules from one hash algorithm to
another, it is necessary to rewrite the submodules from the old
algorithm to the new algorithm, since only references to submodules, not
their contents, are written to the fast-export stream. Without rewriting
the submodules, fast-import fails with an "Invalid dataref" error when
encountering a submodule in another algorithm.
Add a pair of options, --rewrite-submodules-from and
--rewrite-submodules-to, that take a list of marks produced by
fast-export and fast-import, respectively, when processing the
submodule. Use these marks to map the submodule commits from the old
algorithm to the new algorithm.
We read marks into two corresponding struct mark_set objects and then
perform a mapping from the old to the new using a hash table. This lets
us reuse the same mark parsing code that is used elsewhere and allows us
to efficiently read and match marks based on their ID, since mark files
need not be sorted.
Note that because we're using a khash table for the object IDs, and this
table copies values of struct object_id instead of taking references to
them, it's necessary to zero the struct object_id values that we use to
insert and look up in the table. Otherwise, we would end up with SHA-1
values that don't match because of whatever stack garbage might be left
in the unused area.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-22 21:17:49 +01:00
|
|
|
/* Submodule marks */
|
|
|
|
static struct string_list sub_marks_from = STRING_LIST_INIT_DUP;
|
|
|
|
static struct string_list sub_marks_to = STRING_LIST_INIT_DUP;
|
|
|
|
static kh_oid_map_t *sub_oid_map;
|
|
|
|
|
2010-11-28 20:45:01 +01:00
|
|
|
/* Where to write output of cat-blob commands */
|
|
|
|
static int cat_blob_fd = STDOUT_FILENO;
|
|
|
|
|
2009-12-04 18:06:57 +01:00
|
|
|
static void parse_argv(void);
|
2015-07-01 17:05:58 +02:00
|
|
|
static void parse_get_mark(const char *p);
|
2014-06-18 21:49:12 +02:00
|
|
|
static void parse_cat_blob(const char *p);
|
|
|
|
static void parse_ls(const char *p, struct branch *b);
|
2006-08-15 02:16:28 +02:00
|
|
|
|
2020-02-22 21:17:48 +01:00
|
|
|
/*
 * Visit every populated slot of the mark set rooted at `m`.
 *
 * A node with a non-zero `shift` holds up to 1024 child sets; a node with
 * a zero `shift` holds up to 1024 marked entries.  `base` is the part of
 * the mark id contributed by the levels above `m`.  For each non-NULL
 * marked entry, `callback` is invoked with the full mark id, the stored
 * entry and the opaque pointer `p`.
 */
static void for_each_mark(struct mark_set *m, uintmax_t base, each_mark_fn_t callback, void *p)
{
	uintmax_t slot;

	if (!m->shift) {
		/* Leaf level: the slots hold the marked entries themselves. */
		for (slot = 0; slot < 1024; slot++) {
			if (!m->data.marked[slot])
				continue;
			callback(base + slot, m->data.marked[slot], p);
		}
		return;
	}

	/* Interior level: descend into each child that exists. */
	for (slot = 0; slot < 1024; slot++) {
		struct mark_set *child = m->data.sets[slot];

		if (!child)
			continue;
		for_each_mark(child, base + (slot << m->shift), callback, p);
	}
}
|
|
|
|
|
|
|
|
static void dump_marks_fn(uintmax_t mark, void *object, void *cbp) {
|
|
|
|
struct object_entry *e = object;
|
|
|
|
FILE *f = cbp;
|
|
|
|
|
|
|
|
fprintf(f, ":%" PRIuMAX " %s\n", mark, oid_to_hex(&e->idx.oid));
|
|
|
|
}
|
|
|
|
|
2007-08-03 08:00:37 +02:00
|
|
|
static void write_branch_report(FILE *rpt, struct branch *b)
|
|
|
|
{
|
|
|
|
fprintf(rpt, "%s:\n", b->name);
|
|
|
|
|
|
|
|
fprintf(rpt, " status :");
|
|
|
|
if (b->active)
|
|
|
|
fputs(" active", rpt);
|
|
|
|
if (b->branch_tree.tree)
|
|
|
|
fputs(" loaded", rpt);
|
2017-05-01 04:29:03 +02:00
|
|
|
if (is_null_oid(&b->branch_tree.versions[1].oid))
|
2007-08-03 08:00:37 +02:00
|
|
|
fputs(" dirty", rpt);
|
|
|
|
fputc('\n', rpt);
|
|
|
|
|
2017-05-01 04:29:03 +02:00
|
|
|
fprintf(rpt, " tip commit : %s\n", oid_to_hex(&b->oid));
|
|
|
|
fprintf(rpt, " old tree : %s\n",
|
|
|
|
oid_to_hex(&b->branch_tree.versions[0].oid));
|
|
|
|
fprintf(rpt, " cur tree : %s\n",
|
|
|
|
oid_to_hex(&b->branch_tree.versions[1].oid));
|
2007-08-03 08:00:37 +02:00
|
|
|
fprintf(rpt, " commit clock: %" PRIuMAX "\n", b->last_commit);
|
|
|
|
|
|
|
|
fputs(" last pack : ", rpt);
|
|
|
|
if (b->pack_id < MAX_PACK_ID)
|
|
|
|
fprintf(rpt, "%u", b->pack_id);
|
|
|
|
fputc('\n', rpt);
|
|
|
|
|
|
|
|
fputc('\n', rpt);
|
|
|
|
}
|
|
|
|
|
2007-08-21 05:38:14 +02:00
|
|
|
static void write_crash_report(const char *err)
|
2007-08-03 08:00:37 +02:00
|
|
|
{
|
2015-08-10 11:35:31 +02:00
|
|
|
char *loc = git_pathdup("fast_import_crash_%"PRIuMAX, (uintmax_t) getpid());
|
2007-08-03 08:00:37 +02:00
|
|
|
FILE *rpt = fopen(loc, "w");
|
|
|
|
struct branch *b;
|
|
|
|
unsigned long lu;
|
2007-08-03 10:47:04 +02:00
|
|
|
struct recent_command *rc;
|
2007-08-03 08:00:37 +02:00
|
|
|
|
|
|
|
if (!rpt) {
|
2016-05-08 11:47:45 +02:00
|
|
|
error_errno("can't write crash report %s", loc);
|
2015-08-10 11:35:31 +02:00
|
|
|
free(loc);
|
2007-08-03 08:00:37 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(stderr, "fast-import: dumping crash report to %s\n", loc);
|
|
|
|
|
|
|
|
fprintf(rpt, "fast-import crash report:\n");
|
2008-08-31 14:09:39 +02:00
|
|
|
fprintf(rpt, " fast-import process: %"PRIuMAX"\n", (uintmax_t) getpid());
|
|
|
|
fprintf(rpt, " parent process : %"PRIuMAX"\n", (uintmax_t) getppid());
|
2015-09-03 23:48:55 +02:00
|
|
|
fprintf(rpt, " at %s\n", show_date(time(NULL), 0, DATE_MODE(ISO8601)));
|
2007-08-03 08:00:37 +02:00
|
|
|
fputc('\n', rpt);
|
|
|
|
|
|
|
|
fputs("fatal: ", rpt);
|
2007-08-21 05:38:14 +02:00
|
|
|
fputs(err, rpt);
|
2007-08-03 08:00:37 +02:00
|
|
|
fputc('\n', rpt);
|
|
|
|
|
2007-08-03 10:47:04 +02:00
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("Most Recent Commands Before Crash\n", rpt);
|
|
|
|
fputs("---------------------------------\n", rpt);
|
|
|
|
for (rc = cmd_hist.next; rc != &cmd_hist; rc = rc->next) {
|
|
|
|
if (rc->next == &cmd_hist)
|
|
|
|
fputs("* ", rpt);
|
|
|
|
else
|
|
|
|
fputs(" ", rpt);
|
|
|
|
fputs(rc->buf, rpt);
|
|
|
|
fputc('\n', rpt);
|
|
|
|
}
|
|
|
|
|
2007-08-03 08:00:37 +02:00
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("Active Branch LRU\n", rpt);
|
|
|
|
fputs("-----------------\n", rpt);
|
|
|
|
fprintf(rpt, " active_branches = %lu cur, %lu max\n",
|
|
|
|
cur_active_branches,
|
|
|
|
max_active_branches);
|
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs(" pos clock name\n", rpt);
|
|
|
|
fputs(" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", rpt);
|
|
|
|
for (b = active_branches, lu = 0; b; b = b->active_next_branch)
|
|
|
|
fprintf(rpt, " %2lu) %6" PRIuMAX" %s\n",
|
|
|
|
++lu, b->last_commit, b->name);
|
|
|
|
|
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("Inactive Branches\n", rpt);
|
|
|
|
fputs("-----------------\n", rpt);
|
|
|
|
for (lu = 0; lu < branch_table_sz; lu++) {
|
|
|
|
for (b = branch_table[lu]; b; b = b->table_next_branch)
|
|
|
|
write_branch_report(rpt, b);
|
|
|
|
}
|
|
|
|
|
2008-02-14 07:34:36 +01:00
|
|
|
if (first_tag) {
|
|
|
|
struct tag *tg;
|
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("Annotated Tags\n", rpt);
|
|
|
|
fputs("--------------\n", rpt);
|
|
|
|
for (tg = first_tag; tg; tg = tg->next_tag) {
|
2017-05-01 04:29:03 +02:00
|
|
|
fputs(oid_to_hex(&tg->oid), rpt);
|
2008-02-14 07:34:36 +01:00
|
|
|
fputc(' ', rpt);
|
|
|
|
fputs(tg->name, rpt);
|
|
|
|
fputc('\n', rpt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-02-14 07:34:40 +01:00
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("Marks\n", rpt);
|
|
|
|
fputs("-----\n", rpt);
|
2009-12-04 18:06:55 +01:00
|
|
|
if (export_marks_file)
|
|
|
|
fprintf(rpt, " exported to %s\n", export_marks_file);
|
2008-02-14 07:34:40 +01:00
|
|
|
else
|
2020-02-22 21:17:48 +01:00
|
|
|
for_each_mark(marks, 0, dump_marks_fn, rpt);
|
2008-02-14 07:34:40 +01:00
|
|
|
|
2007-08-03 08:00:37 +02:00
|
|
|
fputc('\n', rpt);
|
|
|
|
fputs("-------------------\n", rpt);
|
|
|
|
fputs("END OF CRASH REPORT\n", rpt);
|
|
|
|
fclose(rpt);
|
2015-08-10 11:35:31 +02:00
|
|
|
free(loc);
|
2007-08-03 08:00:37 +02:00
|
|
|
}
|
|
|
|
|
2008-02-14 07:34:43 +01:00
|
|
|
static void end_packfile(void);
|
|
|
|
static void unkeep_all_packs(void);
|
|
|
|
static void dump_marks(void);
|
|
|
|
|
2007-08-03 08:00:37 +02:00
|
|
|
static NORETURN void die_nicely(const char *err, va_list params)
|
|
|
|
{
|
2021-12-07 19:26:30 +01:00
|
|
|
va_list cp;
|
2007-08-03 08:00:37 +02:00
|
|
|
static int zombie;
|
2021-12-07 19:26:30 +01:00
|
|
|
report_fn die_message_fn = get_die_message_routine();
|
2007-08-03 08:00:37 +02:00
|
|
|
|
2021-12-07 19:26:30 +01:00
|
|
|
va_copy(cp, params);
|
|
|
|
die_message_fn(err, params);
|
2007-08-03 08:00:37 +02:00
|
|
|
|
|
|
|
if (!zombie) {
|
2021-12-07 19:26:30 +01:00
|
|
|
char message[2 * PATH_MAX];
|
|
|
|
|
2007-08-03 08:00:37 +02:00
|
|
|
zombie = 1;
|
2021-12-07 19:26:30 +01:00
|
|
|
vsnprintf(message, sizeof(message), err, cp);
|
2007-08-21 05:38:14 +02:00
|
|
|
write_crash_report(message);
|
2008-02-14 07:34:43 +01:00
|
|
|
end_packfile();
|
|
|
|
unkeep_all_packs();
|
|
|
|
dump_marks();
|
2007-08-03 08:00:37 +02:00
|
|
|
}
|
|
|
|
exit(128);
|
|
|
|
}
|
2006-08-08 09:36:45 +02:00
|
|
|
|
2010-11-22 09:16:02 +01:00
|
|
|
#ifndef SIGUSR1 /* Windows, for example */
|
|
|
|
|
|
|
|
static void set_checkpoint_signal(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static void checkpoint_signal(int signo)
|
|
|
|
{
|
|
|
|
checkpoint_requested = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void set_checkpoint_signal(void)
|
|
|
|
{
|
|
|
|
struct sigaction sa;
|
|
|
|
|
|
|
|
memset(&sa, 0, sizeof(sa));
|
|
|
|
sa.sa_handler = checkpoint_signal;
|
|
|
|
sigemptyset(&sa.sa_mask);
|
|
|
|
sa.sa_flags = SA_RESTART;
|
|
|
|
sigaction(SIGUSR1, &sa, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2007-01-15 06:16:23 +01:00
|
|
|
static void alloc_objects(unsigned int cnt)
|
2006-08-06 19:51:39 +02:00
|
|
|
{
|
2006-08-14 06:58:19 +02:00
|
|
|
struct object_entry_pool *b;
|
2006-08-08 06:03:59 +02:00
|
|
|
|
2006-08-14 06:58:19 +02:00
|
|
|
b = xmalloc(sizeof(struct object_entry_pool)
|
2006-08-08 06:03:59 +02:00
|
|
|
+ cnt * sizeof(struct object_entry));
|
2006-08-14 06:58:19 +02:00
|
|
|
b->next_pool = blocks;
|
2006-08-08 06:03:59 +02:00
|
|
|
b->next_free = b->entries;
|
|
|
|
b->end = b->entries + cnt;
|
|
|
|
blocks = b;
|
|
|
|
alloc_count += cnt;
|
|
|
|
}
|
2006-08-06 19:51:39 +02:00
|
|
|
|
2017-05-07 00:09:56 +02:00
|
|
|
static struct object_entry *new_object(struct object_id *oid)
|
2006-08-06 19:51:39 +02:00
|
|
|
{
|
2006-08-08 06:03:59 +02:00
|
|
|
struct object_entry *e;
|
2006-08-06 19:51:39 +02:00
|
|
|
|
2006-08-08 06:03:59 +02:00
|
|
|
if (blocks->next_free == blocks->end)
|
2006-08-14 06:58:19 +02:00
|
|
|
alloc_objects(object_entry_alloc);
|
2006-08-06 19:51:39 +02:00
|
|
|
|
2006-08-08 06:03:59 +02:00
|
|
|
e = blocks->next_free++;
|
2017-05-07 00:10:11 +02:00
|
|
|
oidcpy(&e->idx.oid, oid);
|
2006-08-08 06:03:59 +02:00
|
|
|
return e;
|
2006-08-06 19:51:39 +02:00
|
|
|
}
|
|
|
|
|
2017-05-07 00:09:56 +02:00
|
|
|
static struct object_entry *find_object(struct object_id *oid)
|
2006-08-14 06:58:19 +02:00
|
|
|
{
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
return hashmap_get_entry_from_hash(&object_table, oidhash(oid), oid,
|
|
|
|
struct object_entry, ent);
|
2006-08-14 06:58:19 +02:00
|
|
|
}
|
|
|
|
|
2017-05-07 00:09:56 +02:00
|
|
|
static struct object_entry *insert_object(struct object_id *oid)
|
2006-08-06 19:51:39 +02:00
|
|
|
{
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
struct object_entry *e;
|
|
|
|
unsigned int hash = oidhash(oid);
|
2006-08-06 19:51:39 +02:00
|
|
|
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
e = hashmap_get_entry_from_hash(&object_table, hash, oid,
|
|
|
|
struct object_entry, ent);
|
|
|
|
if (!e) {
|
|
|
|
e = new_object(oid);
|
|
|
|
e->idx.offset = 0;
|
|
|
|
hashmap_entry_init(&e->ent, hash);
|
|
|
|
hashmap_add(&object_table, &e->ent);
|
2006-08-06 19:51:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return e;
|
|
|
|
}
|
2006-08-05 08:04:21 +02:00
|
|
|
|
2016-05-26 00:54:02 +02:00
|
|
|
static void invalidate_pack_id(unsigned int id)
|
|
|
|
{
|
|
|
|
unsigned long lu;
|
|
|
|
struct tag *t;
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
struct hashmap_iter iter;
|
|
|
|
struct object_entry *e;
|
2016-05-26 00:54:02 +02:00
|
|
|
|
fast-import: replace custom hash with hashmap.c
We use a custom hash in fast-import to store the set of objects we've
imported so far. It has a fixed set of 2^16 buckets and chains any
collisions with a linked list. As the number of objects grows larger
than that, the load factor increases and we degrade to O(n) lookups and
O(n^2) insertions.
We can scale better by using our hashmap.c implementation, which will
resize the bucket count as we grow. This does incur an extra memory cost
of 8 bytes per object, as hashmap stores the integer hash value for each
entry in its hashmap_entry struct (which we really don't care about
here, because we're just reusing the embedded object hash). But I think
the numbers below justify this (and our per-object memory cost is
already much higher).
I also looked at using khash, but it seemed to perform slightly worse
than hashmap at all sizes, and worse even than the existing code for
small sizes. It's also awkward to use here, because we want to look up a
"struct object_entry" from a "struct object_id", and it doesn't handle
mismatched keys as well. Making a mapping of object_id to object_entry
would be more natural, but that would require pulling the embedded oid
out of the object_entry or incurring an extra 32 bytes per object.
In a synthetic test creating as many cheap, tiny objects as possible
perl -e '
my $bits = shift;
my $nr = 2**$bits;
for (my $i = 0; $i < $nr; $i++) {
print "blob\n";
print "data 4\n";
print pack("N", $i);
}
' $bits | git fast-import
I got these results:
nr_objects master khash hashmap
2^20 0m4.317s 0m5.109s 0m3.890s
2^21 0m10.204s 0m9.702s 0m7.933s
2^22 0m27.159s 0m17.911s 0m16.751s
2^23 1m19.038s 0m35.080s 0m31.963s
2^24 4m18.766s 1m10.233s 1m6.793s
which points to hashmap as the winner. We didn't have any perf tests for
fast-export or fast-import, so I added one as a more real-world case.
It uses an export without blobs since that's significantly cheaper than
a full one, but still is an interesting case people might use (e.g., for
rewriting history). It will emphasize this change in some ways (as a
percentage we spend more time making objects and less shuffling blob
bytes around) and less in others (the total object count is lower).
Here are the results for linux.git:
Test HEAD^ HEAD
----------------------------------------------------------------------------
9300.1: export (no-blobs) 67.64(66.96+0.67) 67.81(67.06+0.75) +0.3%
9300.2: import (no-blobs) 284.04(283.34+0.69) 198.09(196.01+0.92) -30.3%
It only has ~5.2M commits and trees, so this is a larger effect than I
expected (the 2^23 case above only improved by 50s or so, but here we
gained almost 90s). This is probably due to actually performing more
object lookups in a real import with trees and commits, as opposed to
just dumping a bunch of blobs into a pack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-06 21:49:40 +02:00
|
|
|
hashmap_for_each_entry(&object_table, &iter, e, ent) {
|
|
|
|
if (e->pack_id == id)
|
|
|
|
e->pack_id = MAX_PACK_ID;
|
2016-05-26 00:54:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for (lu = 0; lu < branch_table_sz; lu++) {
|
|
|
|
struct branch *b;
|
|
|
|
|
|
|
|
for (b = branch_table[lu]; b; b = b->table_next_branch)
|
|
|
|
if (b->pack_id == id)
|
|
|
|
b->pack_id = MAX_PACK_ID;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (t = first_tag; t; t = t->next_tag)
|
|
|
|
if (t->pack_id == id)
|
|
|
|
t->pack_id = MAX_PACK_ID;
|
|
|
|
}
|
|
|
|
|
2006-08-14 06:58:19 +02:00
|
|
|
/*
 * Hash the first `len` bytes of `s` with the multiplicative scheme
 * h = h * 31 + byte, starting from 0.  Arithmetic is done in unsigned
 * int, so it wraps on overflow.  Returns 0 when `len` is 0.
 */
static unsigned int hc_str(const char *s, size_t len)
{
	unsigned int hash = 0;
	size_t i;

	for (i = 0; i < len; i++)
		hash = hash * 31 + s[i];
	return hash;
}
|
|
|
|
|
fast-import: fix over-allocation of marks storage
Fast-import stores its marks in a trie-like structure made of mark_set
structs. Each struct has a fixed size (1024). If our id number is too
large to fit in the struct, then we allocate a new struct which shifts
the id number by 10 bits. Our original struct becomes a child node
of this new layer, and the new struct becomes the top level of the trie.
This scheme was broken by ddddf8d7e2 (fast-import: permit reading
multiple marks files, 2020-02-22). Before then, we had a top-level
"marks" pointer, and the push-down worked by assigning the new top-level
struct to "marks". But after that commit, insert_mark() takes a pointer
to the mark_set, rather than using the global "marks". It continued to
assign to the global "marks" variable during the push down, which was
wrong for two reasons:
- we added a call in option_rewrite_submodules() which uses a separate
mark set; pushing down on "marks" is outright wrong here. We'd
corrupt the "marks" set, and we'd fail to correctly store any
submodule mappings with an id over 1024.
- the other callers passed "marks", but the push-down was still wrong.
In read_mark_file(), we take the pointer to the mark_set as a
parameter. So even though insert_mark() was updating the global
"marks", the local pointer we had in read_mark_file() was not
updated. As a result, we'd add a new level when needed, but then the
next call to insert_mark() wouldn't see it! It would then allocate a
new layer, which would also not be seen, and so on. Lookups for the
lost layers obviously wouldn't work, but before we even hit any
lookup stage, we'd generally run out of memory and die.
Our tests didn't notice either of these cases because they didn't have
enough marks to trigger the push-down behavior. The new tests in t9304
cover both cases (and fail without this patch).
We can solve the problem by having insert_mark() take a pointer-to-pointer
of the top-level of the set. Then our push down can assign to it in a
way that the caller actually sees. Note the subtle reordering in
option_rewrite_submodules(). Our call to read_mark_file() may modify our
top-level set pointer, so we have to wait until after it returns to
assign its value into the string_list.
Reported-by: Sergey Brester <serg.brester@sebres.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-10-15 17:38:49 +02:00
|
|
|
static void insert_mark(struct mark_set **top, uintmax_t idnum, struct object_entry *oe)
|
2006-08-23 10:17:45 +02:00
|
|
|
{
|
fast-import: fix over-allocation of marks storage
Fast-import stores its marks in a trie-like structure made of mark_set
structs. Each struct has a fixed size (1024). If our id number is too
large to fit in the struct, then we allocate a new struct which shifts
the id number by 10 bits. Our original struct becomes a child node
of this new layer, and the new struct becomes the top level of the trie.
This scheme was broken by ddddf8d7e2 (fast-import: permit reading
multiple marks files, 2020-02-22). Before then, we had a top-level
"marks" pointer, and the push-down worked by assigning the new top-level
struct to "marks". But after that commit, insert_mark() takes a pointer
to the mark_set, rather than using the global "marks". It continued to
assign to the global "marks" variable during the push down, which was
wrong for two reasons:
- we added a call in option_rewrite_submodules() which uses a separate
mark set; pushing down on "marks" is outright wrong here. We'd
corrupt the "marks" set, and we'd fail to correctly store any
submodule mappings with an id over 1024.
- the other callers passed "marks", but the push-down was still wrong.
In read_mark_file(), we take the pointer to the mark_set as a
parameter. So even though insert_mark() was updating the global
"marks", the local pointer we had in read_mark_file() was not
updated. As a result, we'd add a new level when needed, but then the
next call to insert_mark() wouldn't see it! It would then allocate a
new layer, which would also not be seen, and so on. Lookups for the
lost layers obviously wouldn't work, but before we even hit any
lookup stage, we'd generally run out of memory and die.
Our tests didn't notice either of these cases because they didn't have
enough marks to trigger the push-down behavior. The new tests in t9304
cover both cases (and fail without this patch).
We can solve the problem by having insert_mark() take a pointer-to-pointer
of the top-level of the set. Then our push down can assign to it in a
way that the caller actually sees. Note the subtle reordering in
option_rewrite_submodules(). Our call to read_mark_file() may modify our
top-level set pointer, so we have to wait until after it returns to
assign its value into the string_list.
Reported-by: Sergey Brester <serg.brester@sebres.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-10-15 17:38:49 +02:00
|
|
|
struct mark_set *s = *top;
|
|
|
|
|
2006-08-23 10:17:45 +02:00
|
|
|
while ((idnum >> s->shift) >= 1024) {
|
2018-04-11 20:37:54 +02:00
|
|
|
s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
|
fast-import: fix over-allocation of marks storage
Fast-import stores its marks in a trie-like structure made of mark_set
structs. Each struct has a fixed size (1024). If our id number is too
large to fit in the struct, then we allocate a new struct which shifts
the id number by 10 bits. Our original struct becomes a child node
of this new layer, and the new struct becomes the top level of the trie.
This scheme was broken by ddddf8d7e2 (fast-import: permit reading
multiple marks files, 2020-02-22). Before then, we had a top-level
"marks" pointer, and the push-down worked by assigning the new top-level
struct to "marks". But after that commit, insert_mark() takes a pointer
to the mark_set, rather than using the global "marks". It continued to
assign to the global "marks" variable during the push down, which was
wrong for two reasons:
  - we added a call in option_rewrite_submodules() which uses a separate
    mark set; pushing down on "marks" is outright wrong here. We'd
    corrupt the "marks" set, and we'd fail to correctly store any
    submodule mappings with an id of 1024 or higher.
  - the other callers passed "marks", but the push-down was still wrong.
    In read_mark_file(), we take the pointer to the mark_set as a
    parameter. So even though insert_mark() was updating the global
    "marks", the local pointer we had in read_mark_file() was not
    updated. As a result, we'd add a new level when needed, but then the
    next call to insert_mark() wouldn't see it! It would then allocate a
    new layer, which would also not be seen, and so on. Lookups for the
    lost layers obviously wouldn't work, but before we even hit any
    lookup stage, we'd generally run out of memory and die.
Our tests didn't notice either of these cases because they didn't have
enough marks to trigger the push-down behavior. The new tests in t9304
cover both cases (and fail without this patch).
We can solve the problem by having insert_mark() take a pointer-to-pointer
to the top level of the set. Then our push-down can assign through it in
a way that the caller actually sees. Note the subtle reordering in
option_rewrite_submodules(). Our call to read_mark_file() may modify our
top-level set pointer, so we have to wait until after it returns to
assign its value into the string_list.
Reported-by: Sergey Brester <serg.brester@sebres.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-10-15 17:38:49 +02:00
|
|
|
s->shift = (*top)->shift + 10;
|
|
|
|
s->data.sets[0] = *top;
|
|
|
|
*top = s;
|
2006-08-23 10:17:45 +02:00
|
|
|
}
|
|
|
|
while (s->shift) {
|
2007-01-16 06:33:19 +01:00
|
|
|
uintmax_t i = idnum >> s->shift;
|
2006-08-23 10:17:45 +02:00
|
|
|
idnum -= i << s->shift;
|
|
|
|
if (!s->data.sets[i]) {
|
2018-04-11 20:37:54 +02:00
|
|
|
s->data.sets[i] = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
|
2006-08-23 10:17:45 +02:00
|
|
|
s->data.sets[i]->shift = s->shift - 10;
|
|
|
|
}
|
|
|
|
s = s->data.sets[i];
|
|
|
|
}
|
|
|
|
if (!s->data.marked[idnum])
|
|
|
|
marks_set_count++;
|
|
|
|
s->data.marked[idnum] = oe;
|
|
|
|
}
|
|
|
|
|
2020-02-22 21:17:47 +01:00
|
|
|
static void *find_mark(struct mark_set *s, uintmax_t idnum)
|
2006-08-23 10:17:45 +02:00
|
|
|
{
|
2007-01-16 06:33:19 +01:00
|
|
|
uintmax_t orig_idnum = idnum;
|
2006-08-23 10:17:45 +02:00
|
|
|
struct object_entry *oe = NULL;
|
|
|
|
if ((idnum >> s->shift) < 1024) {
|
|
|
|
while (s && s->shift) {
|
2007-01-16 06:33:19 +01:00
|
|
|
uintmax_t i = idnum >> s->shift;
|
2006-08-23 10:17:45 +02:00
|
|
|
idnum -= i << s->shift;
|
|
|
|
s = s->data.sets[i];
|
|
|
|
}
|
|
|
|
if (s)
|
|
|
|
oe = s->data.marked[idnum];
|
|
|
|
}
|
|
|
|
if (!oe)
|
2007-02-21 02:34:56 +01:00
|
|
|
die("mark :%" |