Git Source Code Mirror - This is a publish-only repository and all pull requests are ignored. Please follow Documentation/SubmittingPatches procedure for any of your improvements. https://git-scm.com/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
git/packfile.c

2284 lines
58 KiB

#include "cache.h"
#include "list.h"
#include "pack.h"
#include "repository.h"
#include "dir.h"
#include "mergesort.h"
#include "packfile.h"
#include "delta.h"
#include "streaming.h"
#include "hash-lookup.h"
#include "commit.h"
#include "object.h"
#include "tag.h"
#include "tree-walk.h"
#include "tree.h"
#include "object-store.h"
#include "midx.h"
#include "commit-graph.h"
#include "promisor-remote.h"
char *odb_pack_name(struct strbuf *buf,
const unsigned char *hash,
const char *ext)
{
strbuf_reset(buf);
strbuf_addf(buf, "%s/pack/pack-%s.%s", get_object_directory(),
hash_to_hex(hash), ext);
return buf->buf;
}
char *sha1_pack_name(const unsigned char *sha1)
{
static struct strbuf buf = STRBUF_INIT;
return odb_pack_name(&buf, sha1, "pack");
}
char *sha1_pack_index_name(const unsigned char *sha1)
{
static struct strbuf buf = STRBUF_INIT;
return odb_pack_name(&buf, sha1, "idx");
}
static unsigned int pack_used_ctr;
static unsigned int pack_mmap_calls;
static unsigned int peak_pack_open_windows;
static unsigned int pack_open_windows;
static unsigned int pack_open_fds;
static unsigned int pack_max_fds;
static size_t peak_pack_mapped;
static size_t pack_mapped;
#define SZ_FMT PRIuMAX
static inline uintmax_t sz_fmt(size_t s) { return s; }
void pack_report(void)
{
fprintf(stderr,
"pack_report: getpagesize() = %10" SZ_FMT "\n"
"pack_report: core.packedGitWindowSize = %10" SZ_FMT "\n"
"pack_report: core.packedGitLimit = %10" SZ_FMT "\n",
sz_fmt(getpagesize()),
sz_fmt(packed_git_window_size),
sz_fmt(packed_git_limit));
fprintf(stderr,
"pack_report: pack_used_ctr = %10u\n"
"pack_report: pack_mmap_calls = %10u\n"
"pack_report: pack_open_windows = %10u / %10u\n"
"pack_report: pack_mapped = "
"%10" SZ_FMT " / %10" SZ_FMT "\n",
pack_used_ctr,
pack_mmap_calls,
pack_open_windows, peak_pack_open_windows,
sz_fmt(pack_mapped), sz_fmt(peak_pack_mapped));
}
/*
* Open and mmap the index file at path, perform a couple of
* consistency checks, then record its information to p. Return 0 on
* success.
*/
static int check_packed_git_idx(const char *path, struct packed_git *p)
{
void *idx_map;
size_t idx_size;
int fd = git_open(path), ret;
struct stat st;
const unsigned int hashsz = the_hash_algo->rawsz;
if (fd < 0)
return -1;
if (fstat(fd, &st)) {
close(fd);
return -1;
}
idx_size = xsize_t(st.st_size);
if (idx_size < 4 * 256 + hashsz + hashsz) {
close(fd);
return error("index file %s is too small", path);
}
idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
ret = load_idx(path, hashsz, idx_map, idx_size, p);
if (ret)
munmap(idx_map, idx_size);
return ret;
}
int load_idx(const char *path, const unsigned int hashsz, void *idx_map,
size_t idx_size, struct packed_git *p)
{
struct pack_idx_header *hdr = idx_map;
uint32_t version, nr, i, *index;
if (idx_size < 4 * 256 + hashsz + hashsz)
return error("index file %s is too small", path);
if (!idx_map)
return error("empty data");
if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
version = ntohl(hdr->idx_version);
if (version < 2 || version > 2)
return error("index file %s is version %"PRIu32
" and is not supported by this binary"
" (try upgrading GIT to a newer version)",
path, version);
} else
version = 1;
nr = 0;
index = idx_map;
if (version > 1)
index += 2; /* skip index header */
for (i = 0; i < 256; i++) {
uint32_t n = ntohl(index[i]);
if (n < nr)
return error("non-monotonic index %s", path);
nr = n;
}
if (version == 1) {
/*
* Total size:
* - 256 index entries 4 bytes each
* - 24-byte entries * nr (object ID + 4-byte offset)
* - hash of the packfile
* - file checksum
*/
if (idx_size != st_add(4 * 256 + hashsz + hashsz, st_mult(nr, hashsz + 4)))
return error("wrong index v1 file size in %s", path);
} else if (version == 2) {
/*
* Minimum size:
* - 8 bytes of header
* - 256 index entries 4 bytes each
* - object ID entry * nr
* - 4-byte crc entry * nr
* - 4-byte offset entry * nr
* - hash of the packfile
* - file checksum
* And after the 4-byte offset table might be a
* variable sized table containing 8-byte entries
* for offsets larger than 2^31.
*/
size_t min_size = st_add(8 + 4*256 + hashsz + hashsz, st_mult(nr, hashsz + 4 + 4));
size_t max_size = min_size;
if (nr)
max_size = st_add(max_size, st_mult(nr - 1, 8));
if (idx_size < min_size || idx_size > max_size)
return error("wrong index v2 file size in %s", path);
if (idx_size != min_size &&
/*
* make sure we can deal with large pack offsets.
* 31-bit signed offset won't be enough, neither
* 32-bit unsigned one will be.
*/
(sizeof(off_t) <= 4))
return error("pack too large for current definition of off_t in %s", path);
p->crc_offset = 8 + 4 * 256 + nr * hashsz;
}
p->index_version = version;
p->index_data = idx_map;
p->index_size = idx_size;
p->num_objects = nr;
return 0;
}
int open_pack_index(struct packed_git *p)
{
char *idx_name;
size_t len;
int ret;
if (p->index_data)
return 0;
if (!strip_suffix(p->pack_name, ".pack", &len))
BUG("pack_name does not end in .pack");
idx_name = xstrfmt("%.*s.idx", (int)len, p->pack_name);
ret = check_packed_git_idx(idx_name, p);
free(idx_name);
return ret;
}
uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
{
const uint32_t *level1_ofs = p->index_data;
if (!level1_ofs) {
if (open_pack_index(p))
return 0;
level1_ofs = p->index_data;
}
if (p->index_version > 1) {
level1_ofs += 2;
}
return ntohl(level1_ofs[value]);
}
static struct packed_git *alloc_packed_git(int extra)
{
struct packed_git *p = xmalloc(st_add(sizeof(*p), extra));
memset(p, 0, sizeof(*p));
p->pack_fd = -1;
return p;
}
struct packed_git *parse_pack_index(unsigned char *sha1, const char *idx_path)
{
const char *path = sha1_pack_name(sha1);
size_t alloc = st_add(strlen(path), 1);
struct packed_git *p = alloc_packed_git(alloc);
memcpy(p->pack_name, path, alloc); /* includes NUL */
hashcpy(p->hash, sha1);
if (check_packed_git_idx(idx_path, p)) {
free(p);
return NULL;
}
return p;
}
static void scan_windows(struct packed_git *p,
struct packed_git **lru_p,
struct pack_window **lru_w,
struct pack_window **lru_l)
{
struct pack_window *w, *w_l;
for (w_l = NULL, w = p->windows; w; w = w->next) {
if (!w->inuse_cnt) {
if (!*lru_w || w->last_used < (*lru_w)->last_used) {
*lru_p = p;
*lru_w = w;
*lru_l = w_l;
}
}
w_l = w;
}
}
static int unuse_one_window(struct packed_git *current)
{
struct packed_git *p, *lru_p = NULL;
struct pack_window *lru_w = NULL, *lru_l = NULL;
if (current)
scan_windows(current, &lru_p, &lru_w, &lru_l);
for (p = the_repository->objects->packed_git; p; p = p->next)
scan_windows(p, &lru_p, &lru_w, &lru_l);
if (lru_p) {
munmap(lru_w->base, lru_w->len);
pack_mapped -= lru_w->len;
if (lru_l)
lru_l->next = lru_w->next;
else
lru_p->windows = lru_w->next;
free(lru_w);
pack_open_windows--;
return 1;
}
return 0;
}
void close_pack_windows(struct packed_git *p)
{
while (p->windows) {
struct pack_window *w = p->windows;
if (w->inuse_cnt)
die("pack '%s' still has open windows to it",
p->pack_name);
munmap(w->base, w->len);
pack_mapped -= w->len;
pack_open_windows--;
p->windows = w->next;
free(w);
}
}
int close_pack_fd(struct packed_git *p)
{
if (p->pack_fd < 0)
return 0;
close(p->pack_fd);
pack_open_fds--;
p->pack_fd = -1;
return 1;
}
void close_pack_index(struct packed_git *p)
{
if (p->index_data) {
munmap((void *)p->index_data, p->index_size);
p->index_data = NULL;
}
}
static void close_pack_revindex(struct packed_git *p)
{
packfile: prepare for the existence of '*.rev' files Specify the format of the on-disk reverse index 'pack-*.rev' file, as well as prepare the code for the existence of such files. The reverse index maps from pack relative positions (i.e., an index into the array of object which is sorted by their offsets within the packfile) to their position within the 'pack-*.idx' file. Today, this is done by building up a list of (off_t, uint32_t) tuples for each object (the off_t corresponding to that object's offset, and the uint32_t corresponding to its position in the index). To convert between pack and index position quickly, this array of tuples is radix sorted based on its offset. This has two major drawbacks: First, the in-memory cost scales linearly with the number of objects in a pack. Each 'struct revindex_entry' is sizeof(off_t) + sizeof(uint32_t) + padding bytes for a total of 16. To observe this, force Git to load the reverse index by, for e.g., running 'git cat-file --batch-check="%(objectsize:disk)"'. When asking for a single object in a fresh clone of the kernel, Git needs to allocate 120+ MB of memory in order to hold the reverse index in memory. Second, the cost to sort also scales with the size of the pack. Luckily, this is a linear function since 'load_pack_revindex()' uses a radix sort, but this cost still must be paid once per pack per process. As an example, it takes ~60x longer to print the _size_ of an object as it does to print that entire object's _contents_: Benchmark #1: git.compile cat-file --batch <obj Time (mean ± σ): 3.4 ms ± 0.1 ms [User: 3.3 ms, System: 2.1 ms] Range (min … max): 3.2 ms … 3.7 ms 726 runs Benchmark #2: git.compile cat-file --batch-check="%(objectsize:disk)" <obj Time (mean ± σ): 210.3 ms ± 8.9 ms [User: 188.2 ms, System: 23.2 ms] Range (min … max): 193.7 ms … 224.4 ms 13 runs Instead, avoid computing and sorting the revindex once per process by writing it to a file when the pack itself is generated. The format is relatively straightforward. It contains an array of uint32_t's, the length of which is equal to the number of objects in the pack. The ith entry in this table contains the index position of the ith object in the pack, where "ith object in the pack" is determined by pack offset. One thing that the on-disk format does _not_ contain is the full (up to) eight-byte offset corresponding to each object. This is something that the in-memory revindex contains (it stores an off_t in 'struct revindex_entry' along with the same uint32_t that the on-disk format has). Omit it in the on-disk format, since knowing the index position for some object is sufficient to get a constant-time lookup in the pack-*.idx file to ask for an object's offset within the pack. This trades off between the on-disk size of the 'pack-*.rev' file for runtime to chase down the offset for some object. Even though the lookup is constant time, the constant is heavier, since it can potentially involve two pointer walks in v2 indexes (one to access the 4-byte offset table, and potentially a second to access the double wide offset table). Consider trying to map an object's pack offset to a relative position within that pack. In a cold-cache scenario, more page faults occur while switching between binary searching through the reverse index and searching through the *.idx file for an object's offset. Sure enough, with a cold cache (writing '3' into '/proc/sys/vm/drop_caches' after 'sync'ing), printing out the entire object's contents is still marginally faster than printing its size: Benchmark #1: git.compile cat-file --batch-check="%(objectsize:disk)" <obj >/dev/null Time (mean ± σ): 22.6 ms ± 0.5 ms [User: 2.4 ms, System: 7.9 ms] Range (min … max): 21.4 ms … 23.5 ms 41 runs Benchmark #2: git.compile cat-file --batch <obj >/dev/null Time (mean ± σ): 17.2 ms ± 0.7 ms [User: 2.8 ms, System: 5.5 ms] Range (min … max): 15.6 ms … 18.2 ms 45 runs (Numbers taken in the kernel after cheating and using the next patch to generate a reverse index). There are a couple of approaches to improve cold cache performance not pursued here: - We could include the object offsets in the reverse index format. Predictably, this does result in fewer page faults, but it triples the size of the file, while simultaneously duplicating a ton of data already available in the .idx file. (This was the original way I implemented the format, and it did show `--batch-check='%(objectsize:disk)'` winning out against `--batch`.) On the other hand, this increase in size also results in a large block-cache footprint, which could potentially hurt other workloads. - We could store the mapping from pack to index position in more cache-friendly way, like constructing a binary search tree from the table and writing the values in breadth-first order. This would result in much better locality, but the price you pay is trading O(1) lookup in 'pack_pos_to_index()' for an O(log n) one (since you can no longer directly index the table). So, neither of these approaches are taken here. (Thankfully, the format is versioned, so we are free to pursue these in the future.) But, cold cache performance likely isn't interesting outside of one-off cases like asking for the size of an object directly. In real-world usage, Git is often performing many operations in the revindex (i.e., asking about many objects rather than a single one). The trade-off is worth it, since we will avoid the vast majority of the cost of generating the revindex that the extra pointer chase will look like noise in the following patch's benchmarks. This patch describes the format and prepares callers (like in pack-revindex.c) to be able to read *.rev files once they exist. An implementation of the writer will appear in the next patch, and callers will gradually begin to start using the writer in the patches that follow after that. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2 years ago
if (!p->revindex_map)
return;
munmap((void *)p->revindex_map, p->revindex_size);
p->revindex_map = NULL;
p->revindex_data = NULL;
}
static void close_pack_mtimes(struct packed_git *p)
{
if (!p->mtimes_map)
return;
munmap((void *)p->mtimes_map, p->mtimes_size);
p->mtimes_map = NULL;
}
void close_pack(struct packed_git *p)
{
close_pack_windows(p);
close_pack_fd(p);
close_pack_index(p);
packfile: prepare for the existence of '*.rev' files Specify the format of the on-disk reverse index 'pack-*.rev' file, as well as prepare the code for the existence of such files. The reverse index maps from pack relative positions (i.e., an index into the array of object which is sorted by their offsets within the packfile) to their position within the 'pack-*.idx' file. Today, this is done by building up a list of (off_t, uint32_t) tuples for each object (the off_t corresponding to that object's offset, and the uint32_t corresponding to its position in the index). To convert between pack and index position quickly, this array of tuples is radix sorted based on its offset. This has two major drawbacks: First, the in-memory cost scales linearly with the number of objects in a pack. Each 'struct revindex_entry' is sizeof(off_t) + sizeof(uint32_t) + padding bytes for a total of 16. To observe this, force Git to load the reverse index by, for e.g., running 'git cat-file --batch-check="%(objectsize:disk)"'. When asking for a single object in a fresh clone of the kernel, Git needs to allocate 120+ MB of memory in order to hold the reverse index in memory. Second, the cost to sort also scales with the size of the pack. Luckily, this is a linear function since 'load_pack_revindex()' uses a radix sort, but this cost still must be paid once per pack per process. As an example, it takes ~60x longer to print the _size_ of an object as it does to print that entire object's _contents_: Benchmark #1: git.compile cat-file --batch <obj Time (mean ± σ): 3.4 ms ± 0.1 ms [User: 3.3 ms, System: 2.1 ms] Range (min … max): 3.2 ms … 3.7 ms 726 runs Benchmark #2: git.compile cat-file --batch-check="%(objectsize:disk)" <obj Time (mean ± σ): 210.3 ms ± 8.9 ms [User: 188.2 ms, System: 23.2 ms] Range (min … max): 193.7 ms … 224.4 ms 13 runs Instead, avoid computing and sorting the revindex once per process by writing it to a file when the pack itself is generated. The format is relatively straightforward. It contains an array of uint32_t's, the length of which is equal to the number of objects in the pack. The ith entry in this table contains the index position of the ith object in the pack, where "ith object in the pack" is determined by pack offset. One thing that the on-disk format does _not_ contain is the full (up to) eight-byte offset corresponding to each object. This is something that the in-memory revindex contains (it stores an off_t in 'struct revindex_entry' along with the same uint32_t that the on-disk format has). Omit it in the on-disk format, since knowing the index position for some object is sufficient to get a constant-time lookup in the pack-*.idx file to ask for an object's offset within the pack. This trades off between the on-disk size of the 'pack-*.rev' file for runtime to chase down the offset for some object. Even though the lookup is constant time, the constant is heavier, since it can potentially involve two pointer walks in v2 indexes (one to access the 4-byte offset table, and potentially a second to access the double wide offset table). Consider trying to map an object's pack offset to a relative position within that pack. In a cold-cache scenario, more page faults occur while switching between binary searching through the reverse index and searching through the *.idx file for an object's offset. Sure enough, with a cold cache (writing '3' into '/proc/sys/vm/drop_caches' after 'sync'ing), printing out the entire object's contents is still marginally faster than printing its size: Benchmark #1: git.compile cat-file --batch-check="%(objectsize:disk)" <obj >/dev/null Time (mean ± σ): 22.6 ms ± 0.5 ms [User: 2.4 ms, System: 7.9 ms] Range (min … max): 21.4 ms … 23.5 ms 41 runs Benchmark #2: git.compile cat-file --batch <obj >/dev/null Time (mean ± σ): 17.2 ms ± 0.7 ms [User: 2.8 ms, System: 5.5 ms] Range (min … max): 15.6 ms … 18.2 ms 45 runs (Numbers taken in the kernel after cheating and using the next patch to generate a reverse index). There are a couple of approaches to improve cold cache performance not pursued here: - We could include the object offsets in the reverse index format. Predictably, this does result in fewer page faults, but it triples the size of the file, while simultaneously duplicating a ton of data already available in the .idx file. (This was the original way I implemented the format, and it did show `--batch-check='%(objectsize:disk)'` winning out against `--batch`.) On the other hand, this increase in size also results in a large block-cache footprint, which could potentially hurt other workloads. - We could store the mapping from pack to index position in more cache-friendly way, like constructing a binary search tree from the table and writing the values in breadth-first order. This would result in much better locality, but the price you pay is trading O(1) lookup in 'pack_pos_to_index()' for an O(log n) one (since you can no longer directly index the table). So, neither of these approaches are taken here. (Thankfully, the format is versioned, so we are free to pursue these in the future.) But, cold cache performance likely isn't interesting outside of one-off cases like asking for the size of an object directly. In real-world usage, Git is often performing many operations in the revindex (i.e., asking about many objects rather than a single one). The trade-off is worth it, since we will avoid the vast majority of the cost of generating the revindex that the extra pointer chase will look like noise in the following patch's benchmarks. This patch describes the format and prepares callers (like in pack-revindex.c) to be able to read *.rev files once they exist. An implementation of the writer will appear in the next patch, and callers will gradually begin to start using the writer in the patches that follow after that. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2 years ago
close_pack_revindex(p);
close_pack_mtimes(p);
oidset_clear(&p->bad_objects);
}
void close_object_store(struct raw_object_store *o)
{
struct packed_git *p;
for (p = o->packed_git; p; p = p->next)
if (p->do_not_close)
BUG("want to close pack marked 'do-not-close'");
else
close_pack(p);
if (o->multi_pack_index) {
close_midx(o->multi_pack_index);
o->multi_pack_index = NULL;
}
close_commit_graph(o);
}
void unlink_pack_path(const char *pack_name, int force_delete)
{
static const char *exts[] = {".pack", ".idx", ".rev", ".keep", ".bitmap", ".promisor", ".mtimes"};
int i;
struct strbuf buf = STRBUF_INIT;
size_t plen;
strbuf_addstr(&buf, pack_name);
strip_suffix_mem(buf.buf, &buf.len, ".pack");
plen = buf.len;
if (!force_delete) {
strbuf_addstr(&buf, ".keep");
if (!access(buf.buf, F_OK)) {
strbuf_release(&buf);
return;
}
}
for (i = 0; i < ARRAY_SIZE(exts); i++) {
strbuf_setlen(&buf, plen);
strbuf_addstr(&buf, exts[i]);
unlink(buf.buf);
}
strbuf_release(&buf);
}
/*
* The LRU pack is the one with the oldest MRU window, preferring packs
* with no used windows, or the oldest mtime if it has no windows allocated.
*/
static void find_lru_pack(struct packed_git *p, struct packed_git **lru_p, struct pack_window **mru_w, int *accept_windows_inuse)
{
struct pack_window *w, *this_mru_w;
int has_windows_inuse = 0;
/*
* Reject this pack if it has windows and the previously selected
* one does not. If this pack does not have windows, reject
* it if the pack file is newer than the previously selected one.
*/
if (*lru_p && !*mru_w && (p->windows || p->mtime > (*lru_p)->mtime))
return;
for (w = this_mru_w = p->windows; w; w = w->next) {
/*
* Reject this pack if any of its windows are in use,
* but the previously selected pack did not have any
* inuse windows. Otherwise, record that this pack
* has windows in use.
*/
if (w->inuse_cnt) {
if (*accept_windows_inuse)
has_windows_inuse = 1;
else
return;
}
if (w->last_used > this_mru_w->last_used)
this_mru_w = w;
/*
* Reject this pack if it has windows that have been
* used more recently than the previously selected pack.
* If the previously selected pack had windows inuse and
* we have not encountered a window in this pack that is
* inuse, skip this check since we prefer a pack with no
* inuse windows to one that has inuse windows.
*/
if (*mru_w && *accept_windows_inuse == has_windows_inuse &&
this_mru_w->last_used > (*mru_w)->last_used)
return;
}
/*
* Select this pack.
*/
*mru_w = this_mru_w;
*lru_p = p;
*accept_windows_inuse = has_windows_inuse;
}
static int close_one_pack(void)
{
struct packed_git *p, *lru_p = NULL;
struct pack_window *mru_w = NULL;
int accept_windows_inuse = 1;
for (p = the_repository->objects->packed_git; p; p = p->next) {
if (p->pack_fd == -1)
continue;
find_lru_pack(p, &lru_p, &mru_w, &accept_windows_inuse);
}
if (lru_p)
return close_pack_fd(lru_p);
return 0;
}
static unsigned int get_max_fd_limit(void)
{
#ifdef RLIMIT_NOFILE
{
struct rlimit lim;
if (!getrlimit(RLIMIT_NOFILE, &lim))
return lim.rlim_cur;
}
#endif
#ifdef _SC_OPEN_MAX
{
long open_max = sysconf(_SC_OPEN_MAX);
if (0 < open_max)
return open_max;
/*
* Otherwise, we got -1 for one of the two
* reasons:
*
* (1) sysconf() did not understand _SC_OPEN_MAX
* and signaled an error with -1; or
* (2) sysconf() said there is no limit.
*
* We _could_ clear errno before calling sysconf() to
* tell these two cases apart and return a huge number
* in the latter case to let the caller cap it to a
* value that is not so selfish, but letting the
* fallback OPEN_MAX codepath take care of these cases
* is a lot simpler.
*/
}
#endif
#ifdef OPEN_MAX
return OPEN_MAX;
#else
return 1; /* see the caller ;-) */
#endif
}
const char *pack_basename(struct packed_git *p)
{
const char *ret = strrchr(p->pack_name, '/');
if (ret)
ret = ret + 1; /* skip past slash */
else
ret = p->pack_name; /* we only have a base */
return ret;
}
/*
* Do not call this directly as this leaks p->pack_fd on error return;
* call open_packed_git() instead.
*/
static int open_packed_git_1(struct packed_git *p)
{
struct stat st;
struct pack_header hdr;
unsigned char hash[GIT_MAX_RAWSZ];
unsigned char *idx_hash;
ssize_t read_result;
const unsigned hashsz = the_hash_algo->rawsz;
packfile.c: protect against disappearing indexes In 17c35c8969 (packfile: skip loading index if in multi-pack-index, 2018-07-12) we stopped loading the .idx file for packs that are contained within a multi-pack index. This saves us the effort of loading an .idx and doing some lightweight validity checks by way of 'packfile.c:load_idx()', but introduces a race between processes that need to load the index (e.g., to generate a reverse index) and processes that can delete the index. For example, running the following in your shell: $ git init repo && cd repo $ git commit --allow-empty -m 'base' $ git repack -ad && git multi-pack-index write followed by: $ rm -f .git/objects/pack/pack-*.idx $ git rev-parse HEAD | git cat-file --batch-check='%(objectsize:disk)' will result in a segfault prior to this patch. What's happening here is that we notice that the pack is in the multi-pack index, and so don't check that it still has a .idx. When we then try and load that index to generate a reverse index, we don't have it, so the call to 'find_pack_revindex()' in 'packfile.c:packed_object_info()' returns NULL, and then dereferencing it causes a segfault. Of course, we don't ever expect someone to remove the index file by hand, or to be in a state where we never wrote it to begin with (yet find that pack in the multi-pack-index). But, this can happen in a timing race with 'git repack -ad', which removes all existing packs after writing a new pack containing all of their objects. Avoid this by reverting the hunk of 17c35c8969 which stops loading the index when the pack is contained in a MIDX. This makes the latter half of 17c35c8969 useless, since we'll always have a non-NULL 'p->index_data', in which case that if statement isn't guarding anything. These two together effectively revert 17c35c8969, and avoid the race explained above. Co-authored-by: Jeff King <peff@peff.net> Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2 years ago
if (open_pack_index(p))
return error("packfile %s index unavailable", p->pack_name);
if (!pack_max_fds) {
unsigned int max_fds = get_max_fd_limit();
/* Save 3 for stdin/stdout/stderr, 22 for work */
if (25 < max_fds)
pack_max_fds = max_fds - 25;
else
pack_max_fds = 1;
}
while (pack_max_fds <= pack_open_fds && close_one_pack())
; /* nothing */
p->pack_fd = git_open(p->pack_name);
if (p->pack_fd < 0 || fstat(p->pack_fd, &st))
return -1;
pack_open_fds++;
/* If we created the struct before we had the pack we lack size. */
if (!p->pack_size) {
if (!S_ISREG(st.st_mode))
return error("packfile %s not a regular file", p->pack_name);
p->pack_size = st.st_size;
} else if (p->pack_size != st.st_size)
return error("packfile %s size changed", p->pack_name);
/* Verify we recognize this pack file format. */
read_result = read_in_full(p->pack_fd, &hdr, sizeof(hdr));
if (read_result < 0)
return error_errno("error reading from %s", p->pack_name);
if (read_result != sizeof(hdr))
return error("file %s is far too short to be a packfile", p->pack_name);
if (hdr.hdr_signature != htonl(PACK_SIGNATURE))
return error("file %s is not a GIT packfile", p->pack_name);
if (!pack_version_ok(hdr.hdr_version))
return error("packfile %s is version %"PRIu32" and not"
" supported (try upgrading GIT to a newer version)",
p->pack_name, ntohl(hdr.hdr_version));
/* Verify the pack matches its index. */
if (p->num_objects != ntohl(hdr.hdr_entries))
return error("packfile %s claims to have %"PRIu32" objects"
" while index indicates %"PRIu32" objects",
p->pack_name, ntohl(hdr.hdr_entries),
p->num_objects);
read_result = pread_in_full(p->pack_fd, hash, hashsz,
p->pack_size - hashsz);
if (read_result < 0)
return error_errno("error reading from %s", p->pack_name);
if (read_result != hashsz)
return error("packfile %s signature is unavailable", p->pack_name);
idx_hash = ((unsigned char *)p->index_data) + p->index_size - hashsz * 2;
if (!hasheq(hash, idx_hash))
return error("packfile %s does not match index", p->pack_name);
return 0;
}
static int open_packed_git(struct packed_git *p)
{
if (!open_packed_git_1(p))
return 0;
close_pack_fd(p);
return -1;
}
static int in_window(struct pack_window *win, off_t offset)
{
/* We must promise at least one full hash after the
* offset is available from this window, otherwise the offset
* is not actually in this window and a different window (which
* has that one hash excess) must be used. This is to support
* the object header and delta base parsing routines below.
*/
off_t win_off = win->offset;
return win_off <= offset
&& (offset + the_hash_algo->rawsz) <= (win_off + win->len);
}
unsigned char *use_pack(struct packed_git *p,
struct pack_window **w_cursor,
off_t offset,
unsigned long *left)
{
struct pack_window *win = *w_cursor;
/* Since packfiles end in a hash of their content and it's
* pointless to ask for an offset into the middle of that
* hash, and the in_window function above wouldn't match
* don't allow an offset too close to the end of the file.
*/
if (!p->pack_size && p->pack_fd == -1 && open_packed_git(p))
die("packfile %s cannot be accessed", p->pack_name);
if (offset > (p->pack_size - the_hash_algo->rawsz))
die("offset beyond end of packfile (truncated pack?)");
if (offset < 0)
die(_("offset before end of packfile (broken .idx?)"));
if (!win || !in_window(win, offset)) {
if (win)
win->inuse_cnt--;
for (win = p->windows; win; win = win->next) {
if (in_window(win, offset))
break;
}
if (!win) {
size_t window_align = packed_git_window_size / 2;
off_t len;
if (p->pack_fd == -1 && open_packed_git(p))
die("packfile %s cannot be accessed", p->pack_name);
CALLOC_ARRAY(win, 1);
win->offset = (offset / window_align) * window_align;
len = p->pack_size - win->offset;
if (len > packed_git_window_size)
len = packed_git_window_size;
win->len = (size_t)len;
pack_mapped += win->len;
while (packed_git_limit < pack_mapped
&& unuse_one_window(p))
; /* nothing */
win->base = xmmap_gently(NULL, win->len,
PROT_READ, MAP_PRIVATE,
p->pack_fd, win->offset);
if (win->base == MAP_FAILED)
die_errno(_("packfile %s cannot be mapped%s"),
p->pack_name, mmap_os_err());
if (!win->offset && win->len == p->pack_size
&& !p->do_not_close)
close_pack_fd(p);
pack_mmap_calls++;
pack_open_windows++;
if (pack_mapped > peak_pack_mapped)
peak_pack_mapped = pack_mapped;
if (pack_open_windows > peak_pack_open_windows)
peak_pack_open_windows = pack_open_windows;
win->next = p->windows;
p->windows = win;
}
}
if (win != *w_cursor) {
win->last_used = pack_used_ctr++;
win->inuse_cnt++;
*w_cursor = win;
}
offset -= win->offset;
if (left)
*left = win->len - xsize_t(offset);
return win->base + offset;
}
void unuse_pack(struct pack_window **w_cursor)
{
struct pack_window *w = *w_cursor;
if (w) {
w->inuse_cnt--;
*w_cursor = NULL;
}
}
struct packed_git *add_packed_git(const char *path, size_t path_len, int local)
{
struct stat st;
size_t alloc;
struct packed_git *p;
/*
* Make sure a corresponding .pack file exists and that
* the index looks