git/fetch-pack.c

2246 lines
62 KiB
C
Raw Permalink Normal View History

#include "git-compat-util.h"
#include "repository.h"
#include "config.h"
#include "date.h"
#include "environment.h"
#include "gettext.h"
#include "hex.h"
#include "lockfile.h"
#include "refs.h"
#include "pkt-line.h"
#include "commit.h"
#include "tag.h"
#include "exec-cmd.h"
#include "pack.h"
#include "sideband.h"
#include "fetch-pack.h"
#include "remote.h"
#include "run-command.h"
#include "connect.h"
#include "trace2.h"
#include "transport.h"
#include "version.h"
#include "oid-array.h"
#include "oidset.h"
#include "packfile.h"
#include "object-store-ll.h"
#include "path.h"
fetch-pack: write shallow, then check connectivity When fetching, connectivity is checked after the shallow file is updated. There are 2 issues with this: (1) the connectivity check is only performed up to ancestors of existing refs (which is not thorough enough if we were deepening an existing ref in the first place), and (2) there is no rollback of the shallow file if the connectivity check fails. To solve (1), update the connectivity check to check the ancestry chain completely in the case of a deepening fetch by refraining from passing "--not --all" when invoking rev-list in connected.c. To solve (2), have fetch_pack() perform its own connectivity check before updating the shallow file. To support existing use cases in which "git fetch-pack" is used to download objects without much regard as to the connectivity of the resulting objects with respect to the existing repository, the connectivity check is only done if necessary (that is, the fetch is not a clone, and the fetch involves shallow/deepen functionality). "git fetch" still performs its own connectivity check, preserving correctness but sometimes performing redundant work. This redundancy is mitigated by the fact that fetch_pack() reports if it has performed a connectivity check itself, and if the transport supports connect or stateless-connect, it will bubble up that report so that "git fetch" knows not to perform the connectivity check in such a case. This was noticed when a user tried to deepen an existing repository by fetching with --no-shallow from a server that did not send all necessary objects - the connectivity check as run by "git fetch" succeeded, but a subsequent "git fsck" failed. Signed-off-by: Jonathan Tan <jonathantanmy@google.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-03 00:08:43 +02:00
#include "connected.h"
#include "fetch-negotiator.h"
fetch: implement fetch.fsck.* Implement support for fetch.fsck.* corresponding with the existing receive.fsck.*. This allows for pedantically cloning repositories with specific issues without turning off fetch.fsckObjects. One such repository is https://github.com/robbyrussell/oh-my-zsh.git which before this change will emit this error when cloned with fetch.fsckObjects: error: object 2b7227859263b6aabcc28355b0b994995b7148b6: zeroPaddedFilemode: contains zero-padded file modes fatal: Error in object fatal: index-pack failed Now with fetch.fsck.zeroPaddedFilemode=warn we'll warn about that issue, but the clone will succeed: warning: object 2b7227859263b6aabcc28355b0b994995b7148b6: zeroPaddedFilemode: contains zero-padded file modes warning: object a18c4d13c2a5fa2d4ecd5346c50e119b999b807d: zeroPaddedFilemode: contains zero-padded file modes warning: object 84df066176c8da3fd59b13731a86d90f4f1e5c9d: zeroPaddedFilemode: contains zero-padded file modes The motivation for this is to be able to turn on fetch.fsckObjects globally across a fleet of computers but still be able to manually clone various legacy repositories by either white-listing specific issues, or better yet whitelist specific objects. The use of --git-dir=* instead of -C in the tests could be considered somewhat archaic, but the tests I'm adding here are duplicating the corresponding receive.* tests with as few changes as possible. Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-27 16:37:17 +02:00
#include "fsck.h"
#include "shallow.h"
fetch: teach independent negotiation (no packfile) Currently, the packfile negotiation step within a Git fetch cannot be done independent of sending the packfile, even though there is at least one application wherein this is useful. Therefore, make it possible for this negotiation step to be done independently. A subsequent commit will use this for one such application - push negotiation. This feature is for protocol v2 only. (An implementation for protocol v0 would require a separate implementation in the fetch, transport, and transport helper code.) In the protocol, the main hindrance towards independent negotiation is that the server can unilaterally decide to send the packfile. This is solved by a "wait-for-done" argument: the server will then wait for the client to say "done". In practice, the client will never say it; instead it will cease requests once it is satisfied. In the client, the main change lies in the transport and transport helper code. fetch_refs_via_pack() performs everything needed - protocol version and capability checks, and the negotiation itself. There are 2 code paths that do not go through fetch_refs_via_pack() that needed to be individually excluded: the bundle transport (excluded through requiring smart_options, which the bundle transport doesn't support) and transport helpers that do not support takeover. If or when we support independent negotiation for protocol v0, we will need to modify these 2 code paths to support it. But for now, report failure if independent negotiation is requested in these cases. Signed-off-by: Jonathan Tan <jonathantanmy@google.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-04 23:16:01 +02:00
#include "commit-reach.h"
#include "commit-graph.h"
fetch-pack: ignore SIGPIPE when writing to index-pack When fetching, we send the incoming pack to index-pack (or unpack-objects) via the sideband demuxer. If index-pack hits an error (e.g., because an object fails fsck), then it will die immediately. This may cause us to get SIGPIPE on the fetch, as we're still trying to write pack contents from the sideband demuxer (which is typically a thread, and thus takes down the whole fetch process). You can see this in action with: ./t5702-protocol-v2.sh --stress --run=59 which ends with (wrapped for readability): test_must_fail: died by signal 13: git -c protocol.version=2 \ -c transfer.fsckobjects=1 -c fetch.uriprotocols=http,https \ clone http://127.0.0.1:5708/smart/http_parent http_child not ok 59 - packfile-uri with transfer.fsckobjects fails on bad object This is mostly cosmetic. The actual error of interest (in this case, the object that failed the fsck check) comes from index-pack straight to stderr, so the user still sees it. They _might_ even see fetch-pack complaining about index-pack failing, because the main thread is racing with the sideband-demuxer. But they'll definitely see the signal death in the exit code, which is what the test is complaining about. We can make this more predictable by just ignoring SIGPIPE. The sideband demuxer uses write_or_die(), so it will notice and stop (gracefully, because we hook die_routine() to exit just the thread). And during this section we're not writing anywhere else where we'd be concerned about SIGPIPE preventing us from wasting effort writing to nowhere. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-11-19 21:58:55 +01:00
#include "sigchain.h"
#include "mergesort.h"
static int transfer_unpack_limit = -1;
static int fetch_unpack_limit = -1;
static int unpack_limit = 100;
static int prefer_ofs_delta = 1;
static int no_done;
static int deepen_since_ok;
static int deepen_not_ok;
static int fetch_fsck_objects = -1;
static int transfer_fsck_objects = -1;
static int agent_supported;
static int server_supports_filtering;
static int advertise_sid;
static struct shallow_lock shallow_lock;
static const char *alternate_shallow_file;
static struct fsck_options fsck_options = FSCK_OPTIONS_MISSING_GITMODULES;
fetch: implement fetch.fsck.* Implement support for fetch.fsck.* corresponding with the existing receive.fsck.*. This allows for pedantically cloning repositories with specific issues without turning off fetch.fsckObjects. One such repository is https://github.com/robbyrussell/oh-my-zsh.git which before this change will emit this error when cloned with fetch.fsckObjects: error: object 2b7227859263b6aabcc28355b0b994995b7148b6: zeroPaddedFilemode: contains zero-padded file modes fatal: Error in object fatal: index-pack failed Now with fetch.fsck.zeroPaddedFilemode=warn we'll warn about that issue, but the clone will succeed: warning: object 2b7227859263b6aabcc28355b0b994995b7148b6: zeroPaddedFilemode: contains zero-padded file modes warning: object a18c4d13c2a5fa2d4ecd5346c50e119b999b807d: zeroPaddedFilemode: contains zero-padded file modes warning: object 84df066176c8da3fd59b13731a86d90f4f1e5c9d: zeroPaddedFilemode: contains zero-padded file modes The motivation for this is to be able to turn on fetch.fsckObjects globally across a fleet of computers but still be able to manually clone various legacy repositories by either white-listing specific issues, or better yet whitelist specific objects. The use of --git-dir=* instead of -C in the tests could be considered somewhat archaic, but the tests I'm adding here are duplicating the corresponding receive.* tests with as few changes as possible. Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-27 16:37:17 +02:00
static struct strbuf fsck_msg_types = STRBUF_INIT;
static struct string_list uri_protocols = STRING_LIST_INIT_DUP;
/* Remember to update object flag allocation in object.h */
#define COMPLETE (1U << 0)
#define ALTERNATE (1U << 1)
fetch: teach independent negotiation (no packfile) Currently, the packfile negotiation step within a Git fetch cannot be done independent of sending the packfile, even though there is at least one application wherein this is useful. Therefore, make it possible for this negotiation step to be done independently. A subsequent commit will use this for one such application - push negotiation. This feature is for protocol v2 only. (An implementation for protocol v0 would require a separate implementation in the fetch, transport, and transport helper code.) In the protocol, the main hindrance towards independent negotiation is that the server can unilaterally decide to send the packfile. This is solved by a "wait-for-done" argument: the server will then wait for the client to say "done". In practice, the client will never say it; instead it will cease requests once it is satisfied. In the client, the main change lies in the transport and transport helper code. fetch_refs_via_pack() performs everything needed - protocol version and capability checks, and the negotiation itself. There are 2 code paths that do not go through fetch_refs_via_pack() that needed to be individually excluded: the bundle transport (excluded through requiring smart_options, which the bundle transport doesn't support) and transport helpers that do not support takeover. If or when we support independent negotiation for protocol v0, we will need to modify these 2 code paths to support it. But for now, report failure if independent negotiation is requested in these cases. Signed-off-by: Jonathan Tan <jonathantanmy@google.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-04 23:16:01 +02:00
#define COMMON (1U << 6)
#define REACH_SCRATCH (1U << 7)
/*
* After sending this many "have"s if we do not get any new ACK , we
* give up traversing our history.
*/
#define MAX_IN_VAIN 256
static int multi_ack, use_sideband;
/* Allow specifying sha1 if it is a ref tip. */
#define ALLOW_TIP_SHA1 01
/* Allow request of a sha1 if it is reachable from a ref (possibly hidden ref). */
#define ALLOW_REACHABLE_SHA1 02
static unsigned int allow_unadvertised_object_request;
__attribute__((format (printf, 2, 3)))
static inline void print_verbose(const struct fetch_pack_args *args,
const char *fmt, ...)
{
va_list params;
if (!args->verbose)
return;
va_start(params, fmt);
vfprintf(stderr, fmt, params);
va_end(params);
fputc('\n', stderr);
}
fetch-pack: cache results of for_each_alternate_ref We may run for_each_alternate_ref() twice, once in find_common() and once in everything_local(). This operation can be expensive, because it involves running a sub-process which must freshly load all of the alternate's refs from disk. Let's cache and reuse the results between the two calls. We can make some optimizations based on the particular use pattern in fetch-pack to keep our memory usage down. The first is that we only care about the sha1s, not the refs themselves. So it's OK to store only the sha1s, and to suppress duplicates. The natural fit would therefore be a sha1_array. However, sha1_array's de-duplication happens only after it has read and sorted all entries. It still stores each duplicate. For an alternate with a large number of refs pointing to the same commits, this is a needless expense. Instead, we'd prefer to eliminate duplicates before putting them in the cache, which implies using a hash. We can further note that fetch-pack will call parse_object() on each alternate sha1. We can therefore keep our cache as a set of pointers to "struct object". That gives us a place to put our "already seen" bit with an optimized hash lookup. And as a bonus, the object stores the sha1 for us, so pointer-to-object is all we need. There are two extra optimizations I didn't do here: - we actually store an array of pointer-to-object. Technically we could just walk the obj_hash table looking for entries with the ALTERNATE flag set (because our use case doesn't care about the order here). But that hash table may be mostly composed of non-ALTERNATE entries, so we'd waste time walking over them. So it would be a slight win in memory use, but a loss in CPU. - the items we pull out of the cache are actual "struct object"s, but then we feed "obj->sha1" to our sub-functions, which promptly call parse_object(). This second parse is cheap, because it starts with lookup_object() and will bail immediately when it sees we've already parsed the object. We could save the extra hash lookup, but it would involve refactoring the functions we call. It may or may not be worth the trouble. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-08 21:53:03 +01:00
struct alternate_object_cache {
struct object **items;
size_t nr, alloc;
};
static void cache_one_alternate(const struct object_id *oid,
fetch-pack: cache results of for_each_alternate_ref We may run for_each_alternate_ref() twice, once in find_common() and once in everything_local(). This operation can be expensive, because it involves running a sub-process which must freshly load all of the alternate's refs from disk. Let's cache and reuse the results between the two calls. We can make some optimizations based on the particular use pattern in fetch-pack to keep our memory usage down. The first is that we only care about the sha1s, not the refs themselves. So it's OK to store only the sha1s, and to suppress duplicates. The natural fit would therefore be a sha1_array. However, sha1_array's de-duplication happens only after it has read and sorted all entries. It still stores each duplicate. For an alternate with a large number of refs pointing to the same commits, this is a needless expense. Instead, we'd prefer to eliminate duplicates before putting them in the cache, which implies using a hash. We can further note that fetch-pack will call parse_object() on each alternate sha1. We can therefore keep our cache as a set of pointers to "struct object". That gives us a place to put our "already seen" bit with an optimized hash lookup. And as a bonus, the object stores the sha1 for us, so pointer-to-object is all we need. There are two extra optimizations I didn't do here: - we actually store an array of pointer-to-object. Technically we could just walk the obj_hash table looking for entries with the ALTERNATE flag set (because our use case doesn't care about the order here). But that hash table may be mostly composed of non-ALTERNATE entries, so we'd waste time walking over them. So it would be a slight win in memory use, but a loss in CPU. - the items we pull out of the cache are actual "struct object"s, but then we feed "obj->sha1" to our sub-functions, which promptly call parse_object(). This second parse is cheap, because it starts with lookup_object() and will bail immediately when it sees we've already parsed the object. We could save the extra hash lookup, but it would involve refactoring the functions we call. It may or may not be worth the trouble. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-08 21:53:03 +01:00
void *vcache)
{
struct alternate_object_cache *cache = vcache;
struct object *obj = parse_object(the_repository, oid);
fetch-pack: cache results of for_each_alternate_ref We may run for_each_alternate_ref() twice, once in find_common() and once in everything_local(). This operation can be expensive, because it involves running a sub-process which must freshly load all of the alternate's refs from disk. Let's cache and reuse the results between the two calls. We can make some optimizations based on the particular use pattern in fetch-pack to keep our memory usage down. The first is that we only care about the sha1s, not the refs themselves. So it's OK to store only the sha1s, and to suppress duplicates. The natural fit would therefore be a sha1_array. However, sha1_array's de-duplication happens only after it has read and sorted all entries. It still stores each duplicate. For an alternate with a large number of refs pointing to the same commits, this is a needless expense. Instead, we'd prefer to eliminate duplicates before putting them in the cache, which implies using a hash. We can further note that fetch-pack will call parse_object() on each alternate sha1. We can therefore keep our cache as a set of pointers to "struct object". That gives us a place to put our "already seen" bit with an optimized hash lookup. And as a bonus, the object stores the sha1 for us, so pointer-to-object is all we need. There are two extra optimizations I didn't do here: - we actually store an array of pointer-to-object. Technically we could just walk the obj_hash table looking for entries with the ALTERNATE flag set (because our use case doesn't care about the order here). But that hash table may be mostly composed of non-ALTERNATE entries, so we'd waste time walking over them. So it would be a slight win in memory use, but a loss in CPU. - the items we pull out of the cache are actual "struct object"s, but then we feed "obj->sha1" to our sub-functions, which promptly call parse_object(). This second parse is cheap, because it starts with lookup_object() and will bail immediately when it sees we've already parsed the object. We could save the extra hash lookup, but it would involve refactoring the functions we call. It may or may not be worth the trouble. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-08 21:53:03 +01:00
if (!obj || (obj->flags & ALTERNATE))
return;
obj->flags |= ALTERNATE;
ALLOC_GROW(cache->items, cache->nr + 1, cache->alloc);
cache->items[cache->nr++] = obj;
}
static void for_each_cached_alternate(struct fetch_negotiator *negotiator,
void (*cb)(struct fetch_negotiator *,
struct object *))
fetch-pack: cache results of for_each_alternate_ref We may run for_each_alternate_ref() twice, once in find_common() and once in everything_local(). This operation can be expensive, because it involves running a sub-process which must freshly load all of the alternate's refs from disk. Let's cache and reuse the results between the two calls. We can make some optimizations based on the particular use pattern in fetch-pack to keep our memory usage down. The first is that we only care about the sha1s, not the refs themselves. So it's OK to store only the sha1s, and to suppress duplicates. The natural fit would therefore be a sha1_array. However, sha1_array's de-duplication happens only after it has read and sorted all entries. It still stores each duplicate. For an alternate with a large number of refs pointing to the same commits, this is a needless expense. Instead, we'd prefer to eliminate duplicates before putting them in the cache, which implies using a hash. We can further note that fetch-pack will call parse_object() on each alternate sha1. We can therefore keep our cache as a set of pointers to "struct object". That gives us a place to put our "already seen" bit with an optimized hash lookup. And as a bonus, the object stores the sha1 for us, so pointer-to-object is all we need. There are two extra optimizations I didn't do here: - we actually store an array of pointer-to-object. Technically we could just walk the obj_hash table looking for entries with the ALTERNATE flag set (because our use case doesn't care about the order here). But that hash table may be mostly composed of non-ALTERNATE entries, so we'd waste time walking over them. So it would be a slight win in memory use, but a loss in CPU. - the items we pull out of the cache are actual "struct object"s, but then we feed "obj->sha1" to our sub-functions, which promptly call parse_object(). This second parse is cheap, because it starts with lookup_object() and will bail immediately when it sees we've already parsed the object. We could save the extra hash lookup, but it would involve refactoring the functions we call. It may or may not be worth the trouble. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-08 21:53:03 +01:00
{
static int initialized;
static struct alternate_object_cache cache;
size_t i;
if (!initialized) {
for_each_alternate_ref(cache_one_alternate, &cache);
initialized = 1;
}
for (i = 0; i < cache.nr; i++)
cb(negotiator, cache.items[i]);
}
static struct commit *deref_without_lazy_fetch_extended(const struct object_id *oid,
int mark_tags_complete,
enum object_type *type,
unsigned int oi_flags)
{
struct object_info info = { .typep = type };
struct commit *commit;
commit = lookup_commit_in_graph(the_repository, oid);
if (commit)
return commit;
while (1) {
if (oid_object_info_extended(the_repository, oid, &info,
oi_flags))
return NULL;
if (*type == OBJ_TAG) {
struct tag *tag = (struct tag *)
parse_object(the_repository, oid);
if (!tag->tagged)
return NULL;
if (mark_tags_complete)
tag->object.flags |= COMPLETE;
oid = &tag->tagged->oid;
} else {
break;
}
}
fetch-pack: speed up loading of refs via commit graph When doing reference negotiation, git-fetch-pack(1) is loading all refs from disk in order to determine which commits it has in common with the remote repository. This can be quite expensive in repositories with many references though: in a real-world repository with around 2.2 million refs, fetching a single commit by its ID takes around 44 seconds. Dominating the loading time is decompression and parsing of the objects which are referenced by commits. Given the fact that we only care about commits (or tags which can be peeled to one) in this context, there is thus an easy performance win by switching the parsing logic to make use of the commit graph in case we have one available. Like this, we avoid hitting the object database to parse these commits but instead only load them from the commit-graph. This results in a significant performance boost when executing git-fetch in said repository with 2.2 million refs: Benchmark #1: HEAD~: git fetch $remote $commit Time (mean ± σ): 44.168 s ± 0.341 s [User: 42.985 s, System: 1.106 s] Range (min … max): 43.565 s … 44.577 s 10 runs Benchmark #2: HEAD: git fetch $remote $commit Time (mean ± σ): 19.498 s ± 0.724 s [User: 18.751 s, System: 0.690 s] Range (min … max): 18.629 s … 20.454 s 10 runs Summary 'HEAD: git fetch $remote $commit' ran 2.27 ± 0.09 times faster than 'HEAD~: git fetch $remote $commit' Signed-off-by: Patrick Steinhardt <ps@pks.im> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-08-04 15:56:11 +02:00
if (*type == OBJ_COMMIT) {
fetch-pack: speed up loading of refs via commit graph When doing reference negotiation, git-fetch-pack(1) is loading all refs from disk in order to determine which commits it has in common with the remote repository. This can be quite expensive in repositories with many references though: in a real-world repository with around 2.2 million refs, fetching a single commit by its ID takes around 44 seconds. Dominating the loading time is decompression and parsing of the objects which are referenced by commits. Given the fact that we only care about commits (or tags which can be peeled to one) in this context, there is thus an easy performance win by switching the parsing logic to make use of the commit graph in case we have one available. Like this, we avoid hitting the object database to parse these commits but instead only load them from the commit-graph. This results in a significant performance boost when executing git-fetch in said repository with 2.2 million refs: Benchmark #1: HEAD~: git fetch $remote $commit Time (mean ± σ): 44.168 s ± 0.341 s [User: 42.985 s, System: 1.106 s] Range (min … max): 43.565 s … 44.577 s 10 runs Benchmark #2: HEAD: git fetch $remote $commit Time (mean ± σ): 19.498 s ± 0.724 s [User: 18.751 s, System: 0.690 s] Range (min … max): 18.629 s … 20.454 s 10 runs Summary 'HEAD: git fetch $remote $commit' ran 2.27 ± 0.09 times faster than 'HEAD~: git fetch $remote $commit' Signed-off-by: Patrick Steinhardt <ps@pks.im> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-08-04 15:56:11 +02:00
struct commit *commit = lookup_commit(the_repository, oid);
if (!commit || repo_parse_commit(the_repository, commit))
return NULL;
return commit;
}
return NULL;
}
static struct commit *deref_without_lazy_fetch(const struct object_id *oid,
int mark_tags_complete)
{
enum object_type type;
unsigned flags = OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK;
return deref_without_lazy_fetch_extended(oid, mark_tags_complete,
&type, flags);
}
static int rev_list_insert_ref(struct fetch_negotiator *negotiator,
const struct object_id *oid)
{
struct commit *c = deref_without_lazy_fetch(oid, 0);
if (c)
negotiator->add_tip(negotiator, c);
return 0;
}
static int rev_list_insert_ref_oid(const char *refname UNUSED,
const struct object_id *oid,
int flag UNUSED,
void *cb_data)
{
return rev_list_insert_ref(cb_data, oid);
}
enum ack_type {
NAK = 0,
ACK,
ACK_continue,
ACK_common,
ACK_ready
};
static void consume_shallow_list(struct fetch_pack_args *args,
struct packet_reader *reader)
{
if (args->stateless_rpc && args->deepen) {
/* If we sent a depth we will get back "duplicate"
* shallow and unshallow commands every time there
* is a block of have lines exchanged.
*/
while (packet_reader_read(reader) == PACKET_READ_NORMAL) {
if (starts_with(reader->line, "shallow "))
continue;
if (starts_with(reader->line, "unshallow "))
continue;
die(_("git fetch-pack: expected shallow list"));
}
if (reader->status != PACKET_READ_FLUSH)
die(_("git fetch-pack: expected a flush packet after shallow list"));
}
}
static enum ack_type get_ack(struct packet_reader *reader,
struct object_id *result_oid)
{
pkt-line: provide a LARGE_PACKET_MAX static buffer Most of the callers of packet_read_line just read into a static 1000-byte buffer (callers which handle arbitrary binary data already use LARGE_PACKET_MAX). This works fine in practice, because: 1. The only variable-sized data in these lines is a ref name, and refs tend to be a lot shorter than 1000 characters. 2. When sending ref lines, git-core always limits itself to 1000 byte packets. However, the only limit given in the protocol specification in Documentation/technical/protocol-common.txt is LARGE_PACKET_MAX; the 1000 byte limit is mentioned only in pack-protocol.txt, and then only describing what we write, not as a specific limit for readers. This patch lets us bump the 1000-byte limit to LARGE_PACKET_MAX. Even though git-core will never write a packet where this makes a difference, there are two good reasons to do this: 1. Other git implementations may have followed protocol-common.txt and used a larger maximum size. We don't bump into it in practice because it would involve very long ref names. 2. We may want to increase the 1000-byte limit one day. Since packets are transferred before any capabilities, it's difficult to do this in a backwards-compatible way. But if we bump the size of buffer the readers can handle, eventually older versions of git will be obsolete enough that we can justify bumping the writers, as well. We don't have plans to do this anytime soon, but there is no reason not to start the clock ticking now. Just bumping all of the reading bufs to LARGE_PACKET_MAX would waste memory. Instead, since most readers just read into a temporary buffer anyway, let's provide a single static buffer that all callers can use. We can further wrap this detail away by having the packet_read_line wrapper just use the buffer transparently and return a pointer to the static storage. That covers most of the cases, and the remaining ones already read into their own LARGE_PACKET_MAX buffers. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-02-20 21:02:57 +01:00
int len;
const char *arg;
if (packet_reader_read(reader) != PACKET_READ_NORMAL)
die(_("git fetch-pack: expected ACK/NAK, got a flush packet"));
len = reader->pktlen;
if (!strcmp(reader->line, "NAK"))
return NAK;
if (skip_prefix(reader->line, "ACK ", &arg)) {
const char *p;
if (!parse_oid_hex(arg, result_oid, &p)) {
len -= p - reader->line;
if (len < 1)
return ACK;
if (strstr(p, "continue"))
return ACK_continue;
if (strstr(p, "common"))
return ACK_common;
if (strstr(p, "ready"))
return ACK_ready;
return ACK;
}
}
die(_("git fetch-pack: expected ACK/NAK, got '%s'"), reader->line);
}
static void send_request(struct fetch_pack_args *args,
int fd, struct strbuf *buf)
{
if (args->stateless_rpc) {
send_sideband(fd, -1, buf->buf, buf->len, LARGE_PACKET_MAX);
packet_flush(fd);
} else {
if (write_in_full(fd, buf->buf, buf->len) < 0)
die_errno(_("unable to write to remote"));
}
}
static void insert_one_alternate_object(struct fetch_negotiator *negotiator,
struct object *obj)
{
rev_list_insert_ref(negotiator, &obj->oid);
}
#define INITIAL_FLUSH 16
#define PIPESAFE_FLUSH 32
#define LARGE_FLUSH 16384
static int next_flush(int stateless_rpc, int count)
{
if (stateless_rpc) {
if (count < LARGE_FLUSH)
count <<= 1;
else
count = count * 11 / 10;
} else {
if (count < PIPESAFE_FLUSH)
count <<= 1;
else
count += PIPESAFE_FLUSH;
}
return count;
}
static void mark_tips(struct fetch_negotiator *negotiator,
const struct oid_array *negotiation_tips)
{
int i;
if (!negotiation_tips) {
for_each_rawref(rev_list_insert_ref_oid, negotiator);
return;
}
for (i = 0; i < negotiation_tips->nr; i++)
rev_list_insert_ref(negotiator, &negotiation_tips->oid[i]);
return;
}
static void send_filter(struct fetch_pack_args *args,
struct strbuf *req_buf,
int server_supports_filter)
{
if (args->filter_options.choice) {
const char *spec =
expand_list_objects_filter_spec(&args->filter_options);
if (server_supports_filter) {
print_verbose(args, _("Server supports filter"));
packet_buf_write(req_buf, "filter %s", spec);
trace2_data_string("fetch", the_repository,
"filter/effective", spec);
} else {
warning("filtering not recognized by server, ignoring");
trace2_data_string("fetch", the_repository,
"filter/unsupported", spec);
}
} else {
trace2_data_string("fetch", the_repository,
"filter/none", "");
}
}
static int find_common(struct fetch_negotiator *negotiator,
struct fetch_pack_args *args,
int fd[2], struct object_id *result_oid,
struct ref *refs)
{
int fetching;
int count = 0, flushes = 0, flush_at = INITIAL_FLUSH, retval;
int negotiation_round = 0, haves = 0;
const struct object_id *oid;
unsigned in_vain = 0;
int got_continue = 0;
int got_ready = 0;
struct strbuf req_buf = STRBUF_INIT;
size_t state_len = 0;
struct packet_reader reader;
if (args->stateless_rpc && multi_ack == 1)
die(_("the option '%s' requires '%s'"), "--stateless-rpc", "multi_ack_detailed");
packet_reader_init(&reader, fd[0], NULL, 0,
PACKET_READ_CHOMP_NEWLINE |
PACKET_READ_DIE_ON_ERR_PACKET);
mark_tips(negotiator, args->negotiation_tips);
for_each_cached_alternate(negotiator, insert_one_alternate_object);
fetching = 0;
for ( ; refs ; refs = refs->next) {
struct object_id *remote = &refs->old_oid;
const char *remote_hex;
struct object *o;
if (!args->refetch) {
/*
* If that object is complete (i.e. it is an ancestor of a
* local ref), we tell them we have it but do not have to
* tell them about its ancestors, which they already know
* about.
*
* We use lookup_object here because we are only
* interested in the case we *know* the object is
* reachable and we have already scanned it.
*/
if (((o = lookup_object(the_repository, remote)) != NULL) &&
(o->flags & COMPLETE)) {
continue;
}
}
remote_hex = oid_to_hex(remote);
if (!fetching) {
struct strbuf c = STRBUF_INIT;
if (multi_ack == 2) strbuf_addstr(&c, " multi_ack_detailed");
if (multi_ack == 1) strbuf_addstr(&c, " multi_ack");
if (no_done) strbuf_addstr(&c, " no-done");
if (use_sideband == 2) strbuf_addstr(&c, " side-band-64k");
if (use_sideband == 1) strbuf_addstr(&c, " side-band");
fetch, upload-pack: --deepen=N extends shallow boundary by N commits In git-fetch, --depth argument is always relative with the latest remote refs. This makes it a bit difficult to cover this use case, where the user wants to make the shallow history, say 3 levels deeper. It would work if remote refs have not moved yet, but nobody can guarantee that, especially when that use case is performed a couple months after the last clone or "git fetch --depth". Also, modifying shallow boundary using --depth does not work well with clones created by --since or --not. This patch fixes that. A new argument --deepen=<N> will add <N> more (*) parent commits to the current history regardless of where remote refs are. Have/Want negotiation is still respected. So if remote refs move, the server will send two chunks: one between "have" and "want" and another to extend shallow history. In theory, the client could send no "want"s in order to get the second chunk only. But the protocol does not allow that. Either you send no want lines, which means ls-remote; or you have to send at least one want line that carries deep-relative to the server.. The main work was done by Dongcan Jiang. I fixed it up here and there. And of course all the bugs belong to me. (*) We could even support --deepen=<N> where <N> is negative. In that case we can cut some history from the shallow clone. This operation (and --depth=<shorter depth>) does not require interaction with remote side (and more complicated to implement as a result). Helped-by: Duy Nguyen <pclouds@gmail.com> Helped-by: Eric Sunshine <sunshine@sunshineco.com> Helped-by: Junio C Hamano <gitster@pobox.com> Signed-off-by: Dongcan Jiang <dongcan.jiang@gmail.com> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-06-12 12:54:09 +02:00
if (args->deepen_relative) strbuf_addstr(&c, " deepen-relative");
if (args->use_thin_pack) strbuf_addstr(&c, " thin-pack");
if (args->no_progress) strbuf_addstr(&c, " no-progress");
if (args->include_tag) strbuf_addstr(&c, " include-tag");
if (prefer_ofs_delta) strbuf_addstr(&c, " ofs-delta");
if (deepen_since_ok) strbuf_addstr(&c, " deepen-since");
if (deepen_not_ok) strbuf_addstr(&c, " deepen-not");
if (agent_supported) strbuf_addf(&c, " agent=%s",
git_user_agent_sanitized());
if (advertise_sid)
strbuf_addf(&c, " session-id=%s", trace2_session_id());
if (args->filter_options.choice)
strbuf_addstr(&c, " filter");
packet_buf_write(&req_buf, "want %s%s\n", remote_hex, c.buf);
strbuf_release(&c);
} else
packet_buf_write(&req_buf, "want %s\n", remote_hex);
fetching++;
}
if (!fetching) {
strbuf_release(&req_buf);
packet_flush(fd[1]);
return 1;
}
if (is_repository_shallow(the_repository))
write_shallow_commits(&req_buf, 1, NULL);
if (args->depth > 0)
packet_buf_write(&req_buf, "deepen %d", args->depth);
if (args->deepen_since) {
timestamp_t max_age = approxidate(args->deepen_since);
packet_buf_write(&req_buf, "deepen-since %"PRItime, max_age);
}
if (args->deepen_not) {
int i;
for (i = 0; i < args->deepen_not->nr; i++) {
struct string_list_item *s = args->deepen_not->items + i;
packet_buf_write(&req_buf, "deepen-not %s", s->string);
}
}
send_filter(args, &req_buf, server_supports_filtering);
packet_buf_flush(&req_buf);
state_len = req_buf.len;
if (args->deepen) {
const char *arg;
struct object_id oid;
send_request(args, fd[1], &req_buf);
while (packet_reader_read(&reader) == PACKET_READ_NORMAL) {
if (skip_prefix(reader.line, "shallow ", &arg)) {
if (get_oid_hex(arg, &oid))
die(_("invalid shallow line: %s"), reader.line);
register_shallow(the_repository, &oid);
continue;
}
if (skip_prefix(reader.line, "unshallow ", &arg)) {
if (get_oid_hex(arg, &oid))
die(_("invalid unshallow line: %s"), reader.line);
if (!lookup_object(the_repository, &oid))
die(_("object not found: %s"), reader.line);
/* make sure that it is parsed as shallow */
if (!parse_object(the_repository, &oid))
die(_("error in object: %s"), reader.line);
if (unregister_shallow(&oid))
die(_("no shallow found: %s"), reader.line);
continue;
}
die(_("expected shallow/unshallow, got %s"), reader.line);
}
} else if (!args->stateless_rpc)
send_request(args, fd[1], &req_buf);
if (!args->stateless_rpc) {
/* If we aren't using the stateless-rpc interface
* we don't need to retain the headers.
*/
strbuf_setlen(&req_buf, 0);
state_len = 0;
}
trace2_region_enter("fetch-pack", "negotiation_v0_v1", the_repository);
flushes = 0;
retval = -1;
while ((oid = negotiator->next(negotiator))) {
packet_buf_write(&req_buf, "have %s\n", oid_to_hex(oid));
print_verbose(args, "have %s", oid_to_hex(oid));
in_vain++;
haves++;
if (flush_at <= ++count) {
int ack;
negotiation_round++;
trace2_region_enter_printf("negotiation_v0_v1", "round",
the_repository, "%d",
negotiation_round);
trace2_data_intmax("negotiation_v0_v1", the_repository,
"haves_added", haves);
trace2_data_intmax("negotiation_v0_v1", the_repository,
"in_vain", in_vain);
haves = 0;
packet_buf_flush(&req_buf);
send_request(args, fd[1], &req_buf);
strbuf_setlen(&req_buf, state_len);
flushes++;
flush_at = next_flush(args->stateless_rpc, count);
/*
* We keep one window "ahead" of the other side, and
* will wait for an ACK only on the next one
*/
if (!args->stateless_rpc && count == INITIAL_FLUSH)
continue;
consume_shallow_list(args, &reader);
do {
ack = get_ack(&reader, result_oid);
if (ack)
print_verbose(args, _("got %s %d %s"), "ack",
ack, oid_to_hex(result_oid));
switch (ack) {
case ACK:
trace2_region_leave_printf("negotiation_v0_v1", "round",
the_repository, "%d",
negotiation_round);
flushes = 0;
multi_ack = 0;
retval = 0;
goto done;
case ACK_common:
case ACK_ready:
case ACK_continue: {
struct commit *commit =
lookup_commit(the_repository,
result_oid);
int was_common;
if (!commit)
die(_("invalid commit %s"), oid_to_hex(result_oid));
was_common = negotiator->ack(negotiator, commit);
if (args->stateless_rpc
&& ack == ACK_common
&& !was_common) {
/* We need to replay the have for this object
* on the next RPC request so the peer knows
* it is in common with us.
*/
const char *hex = oid_to_hex(result_oid);
packet_buf_write(&req_buf, "have %s\n", hex);
state_len = req_buf.len;
haves++;
/*
* Reset in_vain because an ack
* for this commit has not been
* seen.
*/
in_vain = 0;
} else if (!args->stateless_rpc
|| ack != ACK_common)
in_vain = 0;
retval = 0;
got_continue = 1;
if (ack == ACK_ready)
got_ready = 1;
break;
}
}