#include "git-compat-util.h"
#include "refs.h"
#include "object-store-ll.h"
#include "cache-tree.h"
#include "mergesort.h"
#include "convert.h"
#include "diff.h"
#include "diffcore.h"
#include "gettext.h"
#include "hex.h"
#include "path.h"
#include "read-cache.h"
#include "setup.h"
#include "tag.h"
#include "trace2.h"
#include "blame.h"
#include "alloc.h"
#include "commit-slab.h"

blame: use changed-path Bloom filters
The changed-path Bloom filters help reduce the amount of tree
parsing required during history queries. Before calculating a
diff, we can ask the filter if a path changed between a commit
and its first parent. If the filter says "no" then we can move
on without parsing trees. If the filter says "maybe" then we
parse trees to discover if the answer is actually "yes" or "no".
When computing a blame, there is a section in find_origin() that
computes a diff between a commit and one of its parents. When this
is the first parent, we can check the Bloom filters before calling
diff_tree_oid().
In order to make this work with the blame machinery, we need to
initialize a struct bloom_key with the initial path. But also, we
need to add more keys to a list if a rename is detected. We then
check to see if _any_ of these keys answer "maybe" in the diff.
During development, I purposefully left out this "add a new key
when a rename is detected" to see if the test suite would catch
my error. That is how I discovered the issues with
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS from the previous change.
With that change, we can feel some confidence in the coverage of
this change.
If a user requests copy detection using "git blame -C", then there
are more places where the set of "important" files can expand. I
do not know enough about how this happens in the blame machinery.
Thus, the Bloom filter integration is explicitly disabled in this
mode. A later change could expand the bloom_key data with an
appropriate call (or calls) to add_bloom_key().
If we did not disable this mode, then the following tests would
fail:
t8003-blame-corner-cases.sh
t8011-blame-split-file.sh
Generally, this is a performance enhancement and should not
change the behavior of 'git blame' in any way. If a repo has a
commit-graph file with computed changed-path Bloom filters, then
users should notice improved performance for their 'git blame'
commands.
Here are some example timings that I found by blaming some paths
in the Linux kernel repository:
git blame arch/x86/kernel/topology.c >/dev/null
Before: 0.83s
After: 0.24s
git blame kernel/time/time.c >/dev/null
Before: 0.72s
After: 0.24s
git blame tools/perf/ui/stdio/hist.c >/dev/null
Before: 0.27s
After: 0.11s
I specifically looked for "deep" paths that were also edited many
times. As a counterpoint, the MAINTAINERS file was edited many
times but is located in the root tree. This means that the cost of
computing a diff relative to the pathspec is very small. Here are
the timings for that command:
git blame MAINTAINERS >/dev/null
Before: 20.1s
After: 18.0s
These timings are the best of five. The worst-case runs were on the
order of 2.5 minutes for both cases. Note that the MAINTAINERS file
has 18,740 lines across 17,000+ commits. This happens to be one of
the cases where this change provides the least improvement.
The lack of improvement for the MAINTAINERS file and the relatively
modest improvement for the other examples can be easily explained.
The blame machinery needs to compute line-level diffs to determine
which lines were changed by each commit. That makes up a large
proportion of the computation time, and this change does not
attempt to improve on that section of the algorithm. The
MAINTAINERS file is large and changed often, so it takes time to
determine which lines were updated by which commit. In contrast,
the code files are much smaller, and it takes longer to compute
the line-by-line diff for a single patch on the Linux mailing
lists.
Outside of the "-C" integration, I believe there is little more to
gain from the changed-path Bloom filters for 'git blame' after this
patch.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
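
To make the "no means skip, maybe means verify" contract concrete, here is a minimal, self-contained sketch of a Bloom-style membership test. This is an editor's illustration only, not git's bloom.h API: the filter size, number of probes, and hash function are arbitrary. A zero answer is definitive and lets the caller skip tree parsing; a non-zero answer only means "maybe" and must be confirmed with a real diff such as diff_tree_oid().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_BITS 1024	/* toy filter size */

struct sketch_filter {
	unsigned char bits[SKETCH_BITS / 8];
};

static uint32_t sketch_hash(const char *path, uint32_t seed)
{
	uint32_t h = seed;
	while (*path)
		h = h * 31 + (unsigned char)*path++;
	return h % SKETCH_BITS;
}

static void sketch_add_path(struct sketch_filter *f, const char *path)
{
	uint32_t seed;
	for (seed = 0; seed < 3; seed++) {
		uint32_t bit = sketch_hash(path, seed);
		f->bits[bit / 8] |= 1u << (bit % 8);
	}
}

/* 0: path definitely unchanged; 1: maybe changed, verify with a diff */
static int sketch_maybe_changed(const struct sketch_filter *f, const char *path)
{
	uint32_t seed;
	for (seed = 0; seed < 3; seed++) {
		uint32_t bit = sketch_hash(path, seed);
		if (!(f->bits[bit / 8] & (1u << (bit % 8))))
			return 0;
	}
	return 1;
}

int main(void)
{
	struct sketch_filter f;
	memset(&f, 0, sizeof(f));
	sketch_add_path(&f, "kernel/time/time.c");
	printf("%d\n", sketch_maybe_changed(&f, "kernel/time/time.c")); /* 1 */
	printf("%d\n", sketch_maybe_changed(&f, "MAINTAINERS"));        /* almost certainly 0 */
	return 0;
}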

#include "bloom.h"
#include "commit-graph.h"

define_commit_slab(blame_suspects, struct blame_origin *);
static struct blame_suspects blame_suspects;

struct blame_origin *get_blame_suspects(struct commit *commit)
{
	struct blame_origin **result;

	result = blame_suspects_peek(&blame_suspects, commit);

	return result ? *result : NULL;
}

static void set_blame_suspects(struct commit *commit, struct blame_origin *origin)
{
	*blame_suspects_at(&blame_suspects, commit) = origin;
}
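
/*
 * Editor's note: define_commit_slab() above generates the
 * blame_suspects_peek() and blame_suspects_at() helpers used here;
 * _peek() looks up the per-commit slot without allocating (and may
 * return NULL), while _at() allocates the slot on first use.  See
 * commit-slab.h for details.
 */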

void blame_origin_decref(struct blame_origin *o)
{
	if (o && --o->refcnt <= 0) {
		struct blame_origin *p, *l = NULL;
		if (o->previous)
			blame_origin_decref(o->previous);
		free(o->file.ptr);
		/* Should be present exactly once in commit chain */
		for (p = get_blame_suspects(o->commit); p; l = p, p = p->next) {
			if (p == o) {
				if (l)
					l->next = p->next;
				else
					set_blame_suspects(o->commit, p->next);
				free(o);
				return;
			}
		}
		die("internal error in blame_origin_decref");
	}
}

/*
 * Given a commit and a path in it, create a new origin structure.
 * The callers that add blame to the scoreboard should use
 * get_origin() to obtain shared, refcounted copy instead of calling
 * this function directly.
 */
static struct blame_origin *make_origin(struct commit *commit, const char *path)
{
	struct blame_origin *o;
	FLEX_ALLOC_STR(o, path, path);
	o->commit = commit;
	o->refcnt = 1;
	o->next = get_blame_suspects(commit);
	set_blame_suspects(commit, o);
	return o;
}

/*
 * Locate an existing origin or create a new one.
 * This moves the origin to front position in the commit util list.
 */
static struct blame_origin *get_origin(struct commit *commit, const char *path)
{
	struct blame_origin *o, *l;

	for (o = get_blame_suspects(commit), l = NULL; o; l = o, o = o->next) {
		if (!strcmp(o->path, path)) {
			/* bump to front */
			if (l) {
				l->next = o->next;
				o->next = get_blame_suspects(commit);
				set_blame_suspects(commit, o);
			}
			return blame_origin_incref(o);
		}
	}
	return make_origin(commit, path);
}

static void verify_working_tree_path(struct repository *r,
				     struct commit *work_tree, const char *path)
{
	struct commit_list *parents;
	int pos;

	for (parents = work_tree->parents; parents; parents = parents->next) {
		const struct object_id *commit_oid = &parents->item->object.oid;
		struct object_id blob_oid;
		unsigned short mode;

		if (!get_tree_entry(r, commit_oid, path, &blob_oid, &mode) &&
		    oid_object_info(r, &blob_oid, NULL) == OBJ_BLOB)
			return;
	}

	pos = index_name_pos(r->index, path, strlen(path));
	if (pos >= 0)
		; /* path is in the index */
	else if (-1 - pos < r->index->cache_nr &&
		 !strcmp(r->index->cache[-1 - pos]->name, path))
		; /* path is in the index, unmerged */
	else
		die("no such path '%s' in HEAD", path);
}

static struct commit_list **append_parent(struct repository *r,
					  struct commit_list **tail,
					  const struct object_id *oid)
{
	struct commit *parent;

	parent = lookup_commit_reference(r, oid);
	if (!parent)
		die("no such commit %s", oid_to_hex(oid));
	return &commit_list_insert(parent, tail)->next;
}

static void append_merge_parents(struct repository *r,
				 struct commit_list **tail)
{
	int merge_head;
	struct strbuf line = STRBUF_INIT;

	merge_head = open(git_path_merge_head(r), O_RDONLY);
	if (merge_head < 0) {
		if (errno == ENOENT)
			return;
		die("cannot open '%s' for reading",
		    git_path_merge_head(r));
	}

	while (!strbuf_getwholeline_fd(&line, merge_head, '\n')) {
		struct object_id oid;
		if (get_oid_hex(line.buf, &oid))
			die("unknown line in '%s': %s",
			    git_path_merge_head(r), line.buf);
		tail = append_parent(r, tail, &oid);
	}
	close(merge_head);
	strbuf_release(&line);
}
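
/*
 * Editor's note: during an uncommitted merge, MERGE_HEAD holds one full
 * object name per line (several lines for an octopus merge), which is
 * exactly what the get_oid_hex() loop above parses, so the fake
 * work-tree commit created below gets every merge parent, not just HEAD.
 */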

/*
 * This isn't as simple as passing sb->buf and sb->len, because we
 * want to transfer ownership of the buffer to the commit (so we
 * must use detach).
 */
static void set_commit_buffer_from_strbuf(struct repository *r,
					  struct commit *c,
					  struct strbuf *sb)
{
	size_t len;
	void *buf = strbuf_detach(sb, &len);
	set_commit_buffer(r, c, buf, len);
}

/*
 * Prepare a dummy commit that represents the work tree (or staged) item.
 * Note that annotating work tree item never works in the reverse.
 */
static struct commit *fake_working_tree_commit(struct repository *r,
					       struct diff_options *opt,
					       const char *path,
					       const char *contents_from,
					       struct object_id *oid)
{
	struct commit *commit;
	struct blame_origin *origin;
	struct commit_list **parent_tail, *parent;
	struct strbuf buf = STRBUF_INIT;
	const char *ident;
	time_t now;
	int len;
	struct cache_entry *ce;
	unsigned mode;
	struct strbuf msg = STRBUF_INIT;

	repo_read_index(r);
	time(&now);
	commit = alloc_commit_node(r);
	commit->object.parsed = 1;
	commit->date = now;
	parent_tail = &commit->parents;

	parent_tail = append_parent(r, parent_tail, oid);
	append_merge_parents(r, parent_tail);
	verify_working_tree_path(r, commit, path);

	origin = make_origin(commit, path);

	if (contents_from)
		ident = fmt_ident("External file (--contents)", "external.file",
				  WANT_BLANK_IDENT, NULL, 0);
	else
		ident = fmt_ident("Not Committed Yet", "not.committed.yet",
				  WANT_BLANK_IDENT, NULL, 0);
	strbuf_addstr(&msg, "tree 0000000000000000000000000000000000000000\n");
	for (parent = commit->parents; parent; parent = parent->next)
		strbuf_addf(&msg, "parent %s\n",
			    oid_to_hex(&parent->item->object.oid));
	strbuf_addf(&msg,
		    "author %s\n"
		    "committer %s\n\n"
		    "Version of %s from %s\n",
		    ident, ident, path,
		    (!contents_from ? path :
		     (!strcmp(contents_from, "-") ? "standard input" : contents_from)));
	set_commit_buffer_from_strbuf(r, commit, &msg);

	if (!contents_from || strcmp("-", contents_from)) {
		struct stat st;
		const char *read_from;
		char *buf_ptr;
		unsigned long buf_len;

		if (contents_from) {
			if (stat(contents_from, &st) < 0)
				die_errno("Cannot stat '%s'", contents_from);
			read_from = contents_from;
		}
		else {
			if (lstat(path, &st) < 0)
				die_errno("Cannot lstat '%s'", path);
			read_from = path;
		}
		mode = canon_mode(st.st_mode);

		switch (st.st_mode & S_IFMT) {
		case S_IFREG:
			if (opt->flags.allow_textconv &&
			    textconv_object(r, read_from, mode, null_oid(), 0, &buf_ptr, &buf_len))
				strbuf_attach(&buf, buf_ptr, buf_len, buf_len + 1);
			else if (strbuf_read_file(&buf, read_from, st.st_size) != st.st_size)
				die_errno("cannot open or read '%s'", read_from);
			break;
		case S_IFLNK:
			if (strbuf_readlink(&buf, read_from, st.st_size) < 0)
				die_errno("cannot readlink '%s'", read_from);
			break;
		default:
			die("unsupported file type %s", read_from);
		}
	}
	else {
		/* Reading from stdin */
		mode = 0;
		if (strbuf_read(&buf, 0, 0) < 0)
			die_errno("failed to read from stdin");
	}
	convert_to_git(r->index, path, buf.buf, buf.len, &buf, 0);
	origin->file.ptr = buf.buf;
	origin->file.size = buf.len;
	pretend_object_file(buf.buf, buf.len, OBJ_BLOB, &origin->blob_oid);

	/*
	 * Read the current index, replace the path entry with
	 * origin->blob_sha1 without mucking with its mode or type
	 * bits; we are not going to write this index out -- we just
	 * want to run "diff-index --cached".
	 */
	discard_index(r->index);
	repo_read_index(r);

	len = strlen(path);
	if (!mode) {
		int pos = index_name_pos(r->index, path, len);
		if (0 <= pos)
			mode = r->index->cache[pos]->ce_mode;
		else
			/* Let's not bother reading from HEAD tree */
			mode = S_IFREG | 0644;
	}
	ce = make_empty_cache_entry(r->index, len);
	oidcpy(&ce->oid, &origin->blob_oid);
	memcpy(ce->name, path, len);
	ce->ce_flags = create_ce_flags(0);
	ce->ce_namelen = len;
	ce->ce_mode = create_ce_mode(mode);
	add_index_entry(r->index, ce,
			ADD_CACHE_OK_TO_ADD | ADD_CACHE_OK_TO_REPLACE);

	cache_tree_invalidate_path(r->index, path);

	return commit;
}

static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b,
		      xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts)
{
	xpparam_t xpp = {0};
	xdemitconf_t xecfg = {0};
	xdemitcb_t ecb = {NULL};

	xpp.flags = xdl_opts;
	xecfg.hunk_func = hunk_func;
	ecb.priv = cb_data;
	return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb);
}
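
/*
 * Editor's note: an illustrative, hypothetical caller of diff_hunks(),
 * guarded out so it is not compiled.  The consume callback is assumed to
 * follow xdiff's xdl_emit_hunk_consume_func_t shape (one pre-image and
 * one post-image line range per hunk); this helper is not part of blame.c.
 */
#if 0
static int count_hunks_cb(long start_a, long count_a,
			  long start_b, long count_b, void *cb_data)
{
	int *nr = cb_data;
	(*nr)++;
	return 0;
}
/* usage: int nr = 0; diff_hunks(&file_a, &file_b, count_hunks_cb, &nr, 0); */
#endif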

static const char *get_next_line(const char *start, const char *end)
{
	const char *nl = memchr(start, '\n', end - start);

	return nl ? nl + 1 : end;
}

static int find_line_starts(int **line_starts, const char *buf,
			    unsigned long len)
{
	const char *end = buf + len;
	const char *p;
	int *lineno;
	int num = 0;

	for (p = buf; p < end; p = get_next_line(p, end))
		num++;

	ALLOC_ARRAY(*line_starts, num + 1);
	lineno = *line_starts;

	for (p = buf; p < end; p = get_next_line(p, end))
		*lineno++ = p - buf;

	*lineno = len;

	return num;
}
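
/*
 * Editor's note (worked example): for buf = "a\nbc\n" and len = 5,
 * find_line_starts() counts two lines ("a\n" and "bc\n"), allocates three
 * slots, and fills *line_starts with {0, 2, 5}; the final element is the
 * buffer length so that line i spans [starts[i], starts[i + 1]).  The
 * return value is 2.
 */
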
blame: add a fingerprint heuristic to match ignored lines
This algorithm will replace the heuristic used to identify lines from
ignored commits with one that finds likely candidate lines in the
parent's version of the file. The actual replacement occurs in an
upcoming commit.
The old heuristic simply assigned lines in the target to the same line
number (plus offset) in the parent. The new function uses a
fingerprinting algorithm to detect similarity between lines.
The new heuristic is designed to accurately match changes made
mechanically by formatting tools such as clang-format and clang-tidy.
These tools make changes such as breaking up lines to fit within a
character limit or changing identifiers to fit with a naming convention.
The heuristic is not intended to match more extensive refactoring
changes and may give misleading results in such cases.
In most cases formatting tools preserve line ordering, so the heuristic
is optimised for such cases. (Some types of changes do reorder lines
while keeping the line content identical, e.g. sorting; the git blame -M
option can already be used to address this.) The reason that it is advantageous
to rely on ordering is due to source code repeating the same character
sequences often e.g. declaring an identifier on one line and using that
identifier on several subsequent lines. This means that lines can look
very similar to each other which presents a problem when doing fuzzy
matching. Relying on ordering gives us extra clues to point towards the
true match.
The heuristic operates on a single diff chunk change at a time. It
creates a “fingerprint” for each line on each side of the change.
Fingerprints are described in detail in the comment for `struct
fingerprint`, but essentially are a multiset of the character pairs in a
line. The heuristic first identifies the line in the target entry whose
fingerprint is most clearly matched to a line fingerprint in the parent
entry. Where fingerprints match identically, the position of the lines
is used as a tie-break. The heuristic locks in the best match, and
subtracts the fingerprint of the line in the target entry from the
fingerprint of the line in the parent entry to prevent other lines being
matched on the same parts of that line. It then repeats the process
recursively on the section of the chunk before the match, and then the
section of the chunk after the match.
Here's an example of the difference the fingerprinting makes. Consider
a file with two commits:
commit-a 1) void func_1(void *x, void *y);
commit-b 2) void func_2(void *x, void *y);
After a commit 'X', we have:
commit-X 1) void func_1(void *x,
commit-X 2) void *y);
commit-X 3) void func_2(void *x,
commit-X 4) void *y);
When we blame with commit X ignored, the old algorithm gives:
commit-a 1) void func_1(void *x,
commit-b 2) void *y);
commit-X 3) void func_2(void *x,
commit-X 4) void *y);
Where commit-b is blamed for 2 instead of 3. With the fingerprint
algorithm, we get:
commit-a 1) void func_1(void *x,
commit-a 2) void *y);
commit-b 3) void func_2(void *x,
commit-b 4) void *y);
Note line 2 could be matched with either commit-a or commit-b as it is
equally similar to both lines, but is matched with commit-a because its
position as a fraction of the new line range is more similar to commit-a
as a fraction of the old line range. Line 4 is also equally similar to
both lines, but as it appears after line 3 which will be matched first
it cannot be matched with an earlier line.
For many more examples, see t/t8014-blame-ignore-fuzzy.sh which contains
example parent and target files and the line numbers in the parent that
must be matched.
Signed-off-by: Michael Platings <michael@platin.gs>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
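
A worked check of that tie, added for illustration and using the byte-pair fingerprint defined in the comment on `struct fingerprint` below: the target line "void *y);" fingerprints to the multiset {"\0v", "vo", "oi", "id", "d\0", "\0*", "*y", "y)", ");", ";\0"}. Every one of those pairs occurs in both parent lines, and the only pairs that differ between the parents come from "func_1" versus "func_2", none of which appear in the target line. The target's similarity to commit-a's line and to commit-b's line is therefore identical, and the positional mapping is what breaks the tie, exactly as described above.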

struct fingerprint_entry;

/* A fingerprint is intended to loosely represent a string, such that two
 * fingerprints can be quickly compared to give an indication of the similarity
 * of the strings that they represent.
 *
 * A fingerprint is represented as a multiset of the lower-cased byte pairs in
 * the string that it represents. Whitespace is added at each end of the
 * string. Whitespace pairs are ignored. Whitespace is converted to '\0'.
 * For example, the string "Darth Radar" will be converted to the following
 * fingerprint:
 * {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"}
 *
 * The similarity between two fingerprints is the size of the intersection of
 * their multisets, including repeated elements. See fingerprint_similarity for
 * examples.
 *
 * For ease of implementation, the fingerprint is implemented as a map
 * of byte pairs to the count of that byte pair in the string, instead of
 * allowing repeated elements in a set.
 */
struct fingerprint {
	struct hashmap map;
	/* As we know the maximum number of entries in advance, it's
	 * convenient to store the entries in a single array instead of having
	 * the hashmap manage the memory.
	 */
	struct fingerprint_entry *entries;
};

/* A byte pair in a fingerprint. Stores the number of times the byte pair
 * occurs in the string that the fingerprint represents.
 */
struct fingerprint_entry {
	/* The hashmap entry - the hash represents the byte pair in its
	 * entirety so we don't need to store the byte pair separately.
	 */
	struct hashmap_entry entry;
	/* The number of times the byte pair occurs in the string that the
	 * fingerprint represents.
	 */
	int count;
};

/* See `struct fingerprint` for an explanation of what a fingerprint is.
 * \param result the fingerprint of the string is stored here. This must be
 *               freed later using free_fingerprint.
 * \param line_begin the start of the string
 * \param line_end the end of the string
 */
static void get_fingerprint(struct fingerprint *result,
			    const char *line_begin,
			    const char *line_end)
{
	unsigned int hash, c0 = 0, c1;
	const char *p;
	int max_map_entry_count = 1 + line_end - line_begin;
	struct fingerprint_entry *entry = xcalloc(max_map_entry_count,
		sizeof(struct fingerprint_entry));
	struct fingerprint_entry *found_entry;

	hashmap_init(&result->map, NULL, NULL, max_map_entry_count);
	result->entries = entry;
	for (p = line_begin; p <= line_end; ++p, c0 = c1) {
		/* Always terminate the string with whitespace.
		 * Normalise whitespace to 0, and normalise letters to
		 * lower case. This won't work for multibyte characters but at
		 * worst will match some unrelated characters.
		 */
		if ((p == line_end) || isspace(*p))
			c1 = 0;
		else
			c1 = tolower(*p);
		hash = c0 | (c1 << 8);
		/* Ignore whitespace pairs */
		if (hash == 0)
			continue;
		hashmap_entry_init(&entry->entry, hash);
		found_entry = hashmap_get_entry(&result->map, entry,
						/* member name */ entry, NULL);
		if (found_entry) {
			found_entry->count += 1;
		} else {
			entry->count = 1;
			hashmap_add(&result->map, &entry->entry);
			++entry;
		}
	}
}

static void free_fingerprint(struct fingerprint *f)
{
	hashmap_clear(&f->map);
	free(f->entries);
}

/* Calculates the similarity between two fingerprints as the size of the
 * intersection of their multisets, including repeated elements. See
 * `struct fingerprint` for an explanation of the fingerprint representation.
 * The similarity between "cat mat" and "father rather" is 2 because "at" is
 * present twice in both strings while the similarity between "tim" and "mit"
 * is 0.
 */
static int fingerprint_similarity(struct fingerprint *a, struct fingerprint *b)
{
	int intersection = 0;
	struct hashmap_iter iter;
	const struct fingerprint_entry *entry_a, *entry_b;

	hashmap_for_each_entry(&b->map, &iter, entry_b,
				entry /* member name */) {
		entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
		if (entry_a) {
			intersection += entry_a->count < entry_b->count ?
				entry_a->count : entry_b->count;
		}
	}
	return intersection;
}
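
/*
 * Editor's note (worked example): with the whitespace padding applied,
 * "cat mat" yields {"\0c", "ca", "at", "t\0", "\0m", "ma", "at", "t\0"}
 * and "father rather" yields {"\0f", "fa", "at", "th", "he", "er", "r\0",
 * "\0r", "ra", "at", "th", "he", "er", "r\0"}.  Only "at" is shared, with
 * count 2 on both sides, so the similarity is min(2, 2) = 2.  "tim" and
 * "mit" share no byte pair, so their similarity is 0.
 */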

/* Subtracts byte-pair elements in B from A, modifying A in place.
 */
static void fingerprint_subtract(struct fingerprint *a, struct fingerprint *b)
{
	struct hashmap_iter iter;
	struct fingerprint_entry *entry_a;
	const struct fingerprint_entry *entry_b;

	hashmap_iter_init(&b->map, &iter);

	hashmap_for_each_entry(&b->map, &iter, entry_b,
				entry /* member name */) {
		entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
		if (entry_a) {
			if (entry_a->count <= entry_b->count)
				hashmap_remove(&a->map, &entry_b->entry, NULL);
			else
				entry_a->count -= entry_b->count;
		}
	}
}

/* Calculate fingerprints for a series of lines.
 * Puts the fingerprints in the fingerprints array, which must have been
 * preallocated to allow storing line_count elements.
 */
static void get_line_fingerprints(struct fingerprint *fingerprints,
				  const char *content, const int *line_starts,
				  long first_line, long line_count)
{
	int i;
	const char *linestart, *lineend;

	line_starts += first_line;
	for (i = 0; i < line_count; ++i) {
		linestart = content + line_starts[i];
		lineend = content + line_starts[i + 1];
		get_fingerprint(fingerprints + i, linestart, lineend);
	}
}

static void free_line_fingerprints(struct fingerprint *fingerprints,
				   int nr_fingerprints)
{
	int i;

	for (i = 0; i < nr_fingerprints; i++)
		free_fingerprint(&fingerprints[i]);
}

/* This contains the data necessary to linearly map a line number in one half
 * of a diff chunk to the line in the other half of the diff chunk that is
 * closest in terms of its position as a fraction of the length of the chunk.
 */
struct line_number_mapping {
	int destination_start, destination_length,
		source_start, source_length;
};

/* Given a line number in one range, offset and scale it to map it onto the
 * other range.
 * Essentially this mapping is a simple linear equation but the calculation is
 * more complicated to allow performing it with integer operations.
 * Another complication is that if a line could map onto many lines in the
 * destination range then we want to choose the line at the center of those
 * possibilities.
 * Example: if the chunk is 2 lines long in A and 10 lines long in B then the
 * first 5 lines in B will map onto the first line in the A chunk, while the
 * last 5 lines will all map onto the second line in the A chunk.
 * Example: if the chunk is 10 lines long in A and 2 lines long in B then line
 * 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A.
 */
static int map_line_number(int line_number,
		const struct line_number_mapping *mapping)
{
	return ((line_number - mapping->source_start) * 2 + 1) *
		mapping->destination_length /
		(mapping->source_length * 2) +
		mapping->destination_start;
}
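
/*
 * Editor's note (worked check of the second example above): with
 * source = B (start 0, length 2) and destination = A (start 0, length 10),
 * line 0 maps to ((0 * 2) + 1) * 10 / (2 * 2) = 10 / 4 = 2 and line 1 maps
 * to ((1 * 2) + 1) * 10 / 4 = 30 / 4 = 7 after integer truncation, matching
 * "line 0 in B will map onto line 2 in A, and line 1 onto line 7".
 */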

/* Get a pointer to the element storing the similarity between a line in A
 * and a line in B.
 *
 * The similarities are stored in a 2-dimensional array. Each "row" in the
 * array contains the similarities for a line in B. The similarities stored in
 * a row are the similarities between the line in B and the nearby lines in A.
 * To keep the length of each row the same, it is padded out with values of -1
 * where the search range extends beyond the lines in A.
 * For example, if max_search_distance_a is 2 and the two sides of a diff chunk
 * look like this:
 * a | m
 * b | n
 * c | o
 * d | p
 * e | q
 * Then the similarity array will contain:
 * [-1, -1, am, bm, cm,
 *  -1, an, bn, cn, dn,
 *  ao, bo, co, do, eo,
 *  bp, cp, dp, ep, -1,
 *  cq, dq, eq, -1, -1]
 * Where similarities are denoted either by -1 for invalid, or the
 * concatenation of the two lines in the diff being compared.
 *
 * \param similarities array of similarities between lines in A and B
 * \param line_a the index of the line in A, in the same frame of reference as
 *               closest_line_a.
 * \param local_line_b the index of the line in B, relative to the first line
 *                     in B that similarities represents.
 * \param closest_line_a the index of the line in A that is deemed to be
 *                       closest to local_line_b. This must be in the same
 *                       frame of reference as line_a. This value defines
 *                       where similarities is centered for the line in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 */
static int *get_similarity(int *similarities,
			   int line_a, int local_line_b,
			   int closest_line_a, int max_search_distance_a)
{
	assert(abs(line_a - closest_line_a) <=
	       max_search_distance_a);
	return similarities + line_a - closest_line_a +
	       max_search_distance_a +
	       local_line_b * (max_search_distance_a * 2 + 1);
}
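As a concrete reading of that layout (illustrative only, not part of blame.c): with max_search_distance_a of 2, each row holds 2 * 2 + 1 = 5 entries, so for the a-e/m-q example the entry for lines "b" and "n" sits at offset (1 - 1) + 2 + 1 * 5 = 7, i.e. the third entry of the second row, which is "bn" in the array shown in the comment.

/* Illustrative use of get_similarity() -- not part of blame.c. */
static void example_check_get_similarity(void)
{
	int similarities[25];	/* 5 rows of 5 entries for the a-e/m-q example */

	/* line "b" is index 1 in A; line "n" is local index 1 in B and its
	 * closest line in A is also index 1, so the element is at offset 7.
	 */
	assert(get_similarity(similarities, 1, 1, 1, 2) == similarities + 7);
}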

#define CERTAIN_NOTHING_MATCHES -2
#define CERTAINTY_NOT_CALCULATED -1

/* Given a line in B, first calculate its similarities with nearby lines in A
 * if not already calculated, then identify the most similar and second most
 * similar lines. The "certainty" is calculated based on those two
 * similarities.
 *
 * \param start_a the index of the first line of the chunk in A
 * \param length_a the length in lines of the chunk in A
 * \param local_line_b the index of the line in B, relative to the first line
 *                     in the chunk.
 * \param fingerprints_a array of fingerprints for the chunk in A
 * \param fingerprints_b array of fingerprints for the chunk in B
 * \param similarities 2-dimensional array of similarities between lines in A
 *                     and B. See get_similarity() for more details.
 * \param certainties array of values indicating how strongly a line in B is
 *                    matched with some line in A.
 * \param second_best_result array of absolute indices in A for the second
 *                           closest match of a line in B.
 * \param result array of absolute indices in A for the closest match of a line
 *               in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 * \param map_line_number_in_b_to_a parameter to map_line_number().
 */
static void find_best_line_matches(
	int start_a,
	int length_a,
	int start_b,
	int local_line_b,
	struct fingerprint *fingerprints_a,
	struct fingerprint *fingerprints_b,
	int *similarities,
	int *certainties,
	int *second_best_result,
	int *result,
	const int max_search_distance_a,
	const struct line_number_mapping *map_line_number_in_b_to_a)
{
	int i, search_start, search_end, closest_local_line_a, *similarity,
		best_similarity = 0, second_best_similarity = 0,
		best_similarity_index = 0, second_best_similarity_index = 0;

	/* certainty has already been calculated so no need to redo the work */
	if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
		return;

	closest_local_line_a = map_line_number(
		local_line_b + start_b, map_line_number_in_b_to_a) - start_a;

	search_start = closest_local_line_a - max_search_distance_a;
	if (search_start < 0)
		search_start = 0;

	search_end = closest_local_line_a + max_search_distance_a + 1;
	if (search_end > length_a)
		search_end = length_a;

	for (i = search_start; i < search_end; ++i) {
		similarity = get_similarity(similarities,
					    i, local_line_b,
					    closest_local_line_a,
					    max_search_distance_a);
		if (*similarity == -1) {
			/* This value will never exceed 10 but assert just in
			 * case
			 */
			assert(abs(i - closest_local_line_a) < 1000);
			/* scale the similarity by (1000 - distance from
			 * closest line) to act as a tie break between lines
			 * that otherwise are equally similar.
			 */
			*similarity = fingerprint_similarity(
				fingerprints_b + local_line_b,
				fingerprints_a + i) *
				(1000 - abs(i - closest_local_line_a));
		}
		if (*similarity > best_similarity) {
			second_best_similarity = best_similarity;
			second_best_similarity_index = best_similarity_index;
			best_similarity = *similarity;
			best_similarity_index = i;
		} else if (*similarity > second_best_similarity) {
			second_best_similarity = *similarity;
			second_best_similarity_index = i;
		}
	}

	if (best_similarity == 0) {
		/* this line definitely doesn't match with anything. Mark it
		 * with this special value so it doesn't get invalidated and
		 * won't be recalculated.
		 */
		certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
		result[local_line_b] = -1;
	} else {
		/* Calculate the certainty with which this line matches.
		 * If the line matches well with two lines then that reduces
		 * the certainty. However we still want to prioritise matching
		 * a line that matches very well with two lines over matching a
		 * line that matches poorly with one line, hence doubling
		 * best_similarity.
		 * This means that if we have
		 * line X that matches only one line with a score of 3,
		 * line Y that matches two lines equally with a score of 5,
		 * and line Z that matches only one line with a score of 2,
		 * then the lines in order of certainty are X, Y, Z.
		 */
		certainties[local_line_b] = best_similarity * 2 -
			second_best_similarity;

		/* We keep both the best and second best results to allow us to
		 * check at a later stage of the matching process whether the
		 * result needs to be invalidated.
		 */
		result[local_line_b] = start_a + best_similarity_index;
		second_best_result[local_line_b] =
			start_a + second_best_similarity_index;
	}
}
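Working through the X/Y/Z example from the comment above (noting that second_best_similarity stays at its initial 0 when only one line in the search range matches): X gets certainty 3 * 2 - 0 = 6, Y gets 5 * 2 - 5 = 5, and Z gets 2 * 2 - 0 = 4, which gives the stated ordering X, Y, Z.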

/*
 * This finds the line that we can match with the most confidence, and
 * uses it as a partition. It then calls itself on the lines on either side of
 * that partition. In this way we avoid lines appearing out of order, and
 * retain a sensible line ordering.
 * \param start_a index of the first line in A with which lines in B may be
 *                compared.
 * \param start_b index of the first line in B for which matching should be
 *                done.
 * \param length_a number of lines in A with which lines in B may be compared.
 * \param length_b number of lines in B for which matching should be done.
 * \param fingerprints_a mutable array of fingerprints in A. The first element
 *                       corresponds to the line at start_a.
 * \param fingerprints_b array of fingerprints in B. The first element
 *                       corresponds to the line at start_b.
 * \param similarities 2-dimensional array of similarities between lines in A
 *                     and B. See get_similarity() for more details.
 * \param certainties array of values indicating how strongly a line in B is
 *                    matched with some line in A.
 * \param second_best_result array of absolute indices in A for the second
 *                           closest match of a line in B.
 * \param result array of absolute indices in A for the closest match of a line
 *               in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 * \param max_search_distance_b an upper bound on the greatest possible
 *                              distance between lines in B such that they will
 *                              both be compared with the same line in A
 *                              according to max_search_distance_a.
 * \param map_line_number_in_b_to_a parameter to map_line_number().
 */
static void fuzzy_find_matching_lines_recurse(
	int start_a, int start_b,
	int length_a, int length_b,
	struct fingerprint *fingerprints_a,
	struct fingerprint *fingerprints_b,
	int *similarities,
	int *certainties,
	int *second_best_result,
	int *result,
	int max_search_distance_a,
	int max_search_distance_b,
	const struct line_number_mapping *map_line_number_in_b_to_a)
{
	int i, invalidate_min, invalidate_max, offset_b,
		second_half_start_a, second_half_start_b,
		second_half_length_a, second_half_length_b,
		most_certain_line_a, most_certain_local_line_b = -1,
		most_certain_line_certainty = -1,
		closest_local_line_a;

	for (i = 0; i < length_b; ++i) {
		find_best_line_matches(start_a,
				       length_a,