From 8d6fe1046d948f9b31d21d260a8b1d1cb4145930 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Tue, 6 Jan 2026 19:23:39 +0100 Subject: pkg/vcs: find base commit by blob sha hashes Given a git diff, determine the latest commit where the modified files still have the exact sha hashes they had at the moment the git patch was created. --- pkg/vcs/git.go | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++ pkg/vcs/git_test.go | 84 ++++++++++++++++++++++++ 2 files changed, 264 insertions(+) (limited to 'pkg') diff --git a/pkg/vcs/git.go b/pkg/vcs/git.go index a2975c034..6bb826edb 100644 --- a/pkg/vcs/git.go +++ b/pkg/vcs/git.go @@ -747,3 +747,183 @@ func (git Git) fetchCommits(since, base, user, domain string, greps []string, fi } return commits, s.Err() } + +type BaseCommit struct { + *Commit + Branches []string +} + +// BaseForDiff selects the most recent commit that could have been the base commit +// for the specified git patch. +func (git Git) BaseForDiff(diff []byte, tracer debugtracer.DebugTracer) (*BaseCommit, error) { + // We can't just query git log with --find-object=HASH because that will only return + // the revisions where the hashed content was introduced or removed, while what we actually + // want is the latest revision(s) where the content modified in the diff is still in place. + + // So we build a set of potential commits of interest: + // 1) Tips of the branches. + // 2) Parents of the commit that in any way mention the modified blob hashes. + // Then these commits are verified. + + args := []string{ + "log", + "--all", + "--no-renames", + // Note that we cannot accelerate it by specifying "--since" + "-n", "100", + `--format=%P`, + } + var fileNames []string + nameToHash := map[string]string{} + for _, file := range ParseGitDiff(diff) { + if strings.Trim(file.LeftHash, "0") == "" { + // Newly created file are not of any help here. + continue + } + if ok, err := git.verifyHash(file.LeftHash); err != nil { + return nil, fmt.Errorf("hash verification failed: %w", err) + } else if !ok { + // The object is not known in this repository. + // Ignore it, or otherwise the command will fail. + continue + } + if _, ok := nameToHash[file.Name]; !ok { + // If the diff is actually a concatenation of several diffs, we only + // want to remember the first left side hash for each file. + fileNames = append(fileNames, file.Name) + nameToHash[file.Name] = file.LeftHash + } + args = append(args, "--find-object="+file.LeftHash) + } + tracer.Log("extracted %d left blob hashes", len(nameToHash)) + output, err := git.Run(args...) + if err != nil { + return nil, err + } + commitBranches := map[string]map[string]struct{}{} + record := func(commit string, branch string) { + if commitBranches[commit] == nil { + commitBranches[commit] = map[string]struct{}{} + } + commitBranches[commit][branch] = struct{}{} + } + + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + // TODO: we can further reduce the search space by adding "--raw" to args + // and only considering the commits that introduce the blobs from the diff. + firstParent, _, _ := strings.Cut(s.Text(), " ") + if firstParent == "" { + continue + } + // Only focus on branches that are still alive. + const cutOffDays = 60 + list, err := git.branchesThatContain(firstParent, time.Now().Add(-time.Hour*24*cutOffDays)) + if err != nil { + return nil, fmt.Errorf("failed to query branches: %w", err) + } + for _, info := range list { + record(firstParent, info.Branch) + record(info.Commit, info.Branch) + } + } + var ret *BaseCommit + for commit, branches := range commitBranches { + tracer.Log("considering %q [%q]", commit, branches) + fileHashes, err := git.fileHashes(commit, fileNames) + if err != nil { + return nil, fmt.Errorf("failed to extract hashes for %s: %w", commit, err) + } + var noMatch []string + for _, name := range fileNames { + if !strings.HasPrefix(fileHashes[name], nameToHash[name]) { + noMatch = append(noMatch, name) + } + } + if len(noMatch) != 0 { + tracer.Log("hashes don't match for %q", noMatch) + continue + } + var branchList []string + for branch := range branches { + branchList = append(branchList, branch) + } + sort.Strings(branchList) + info, err := git.Commit(commit) + if err != nil { + return nil, fmt.Errorf("failed to extract commit info: %w", err) + } + tracer.Log("commit date is %v", info.CommitDate) + // Select the most recent commit. + if ret == nil || ret.CommitDate.Before(info.CommitDate) { + ret = &BaseCommit{Commit: info, Branches: branchList} + } + } + return ret, nil +} + +// fileHashes returns the blob SHA hashes for a particular list of files on a particular commit. +func (git Git) fileHashes(commit string, files []string) (map[string]string, error) { + output, err := git.Run(append([]string{"ls-tree", commit}, files...)...) + if err != nil { + return nil, err + } + ret := map[string]string{} + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + line := s.Text() + fields := strings.Fields(line) + if len(fields) != 4 { + return nil, fmt.Errorf("invalid output: %q", line) + } + ret[fields[3]] = fields[2] + } + return ret, nil +} + +type branchCommit struct { + Branch string + Commit string +} + +func (git Git) branchesThatContain(commit string, since time.Time) ([]branchCommit, error) { + output, err := git.Run( + "branch", "-a", + "--contains", commit, + `--format=%(committerdate);%(objectname);%(refname:short)`, + ) + if err != nil { + return nil, err + } + var ret []branchCommit + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + dateString, branchInfo, _ := strings.Cut(s.Text(), ";") + commit, branch, _ := strings.Cut(branchInfo, ";") + date, err := time.Parse(gitDateFormat, dateString) + if err != nil { + return nil, fmt.Errorf("failed to parse git date: %w\n%q", err, dateString) + } + if date.Before(since) { + continue + } + ret = append(ret, branchCommit{Branch: branch, Commit: commit}) + } + return ret, nil +} + +func (git Git) verifyHash(hash string) (bool, error) { + _, err := git.Run("rev-parse", "--quiet", "--verify", hash) + if err != nil { + var verboseErr *osutil.VerboseError + if errors.As(err, &verboseErr) && verboseErr.ExitCode == 1 { + return false, nil + } + return false, err + } + return true, nil +} + +func (git Git) Diff(commitA, commitB string) ([]byte, error) { + return git.Run("diff", commitA+".."+commitB) +} diff --git a/pkg/vcs/git_test.go b/pkg/vcs/git_test.go index 5050bcdc0..542b572ac 100644 --- a/pkg/vcs/git_test.go +++ b/pkg/vcs/git_test.go @@ -10,7 +10,9 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/google/syzkaller/pkg/debugtracer" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestGitParseCommit(t *testing.T) { @@ -484,3 +486,85 @@ index f70f10e..0000000 }, }) } + +func TestGitFileHashes(t *testing.T) { + repo := MakeTestRepo(t, t.TempDir()) + commit1 := repo.commitChangeset("first commit", writeFile{"object.txt", "some text"}) + commit2 := repo.commitChangeset("second commit", writeFile{"object2.txt", "some text2"}) + + map1, err := repo.repo.fileHashes(commit1.Hash, []string{"object.txt", "object2.txt"}) + require.NoError(t, err) + assert.NotEmpty(t, map1["object.txt"]) + + map2, err := repo.repo.fileHashes(commit2.Hash, []string{"object.txt", "object2.txt"}) + require.NoError(t, err) + assert.NotEmpty(t, map2["object.txt"]) + assert.NotEmpty(t, map2["object2.txt"]) +} + +func TestBaseForDiff(t *testing.T) { + repo := MakeTestRepo(t, t.TempDir()) + repo.commitChangeset("first commit", + writeFile{"a.txt", "content of a.txt"}, + writeFile{"b.txt", "content of b.txt"}, + ) + commit2 := repo.commitChangeset("second commit", + writeFile{"c.txt", "content of c.txt"}, + writeFile{"d.txt", "content of d.txt"}, + ) + // Create a diff. + commit3 := repo.commitChangeset("third commit", + writeFile{"a.txt", "update a.txt"}, + ) + diff, err := repo.repo.Diff(commit2.Hash, commit3.Hash) + require.NoError(t, err) + t.Run("conflicting", func(t *testing.T) { + _, err := repo.repo.SwitchCommit(commit2.Hash) + require.NoError(t, err) + // Create a different change on top of commit2. + repo.Git("checkout", "-b", "branch-a") + time.Sleep(time.Second) + repo.commitChangeset("patch a.txt", + writeFile{"a.txt", "another change to a.txt"}, + ) + // Yet the patch could only be applied to commit2 + base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-a", "master"}, base.Branches) + assert.Equal(t, commit2.Hash, base.Hash) + }) + t.Run("choose latest", func(t *testing.T) { + _, err := repo.repo.SwitchCommit(commit2.Hash) + require.NoError(t, err) + // Wait a second and add one more commit. + // (Otherwise the test might be flaky). + time.Sleep(time.Second) + repo.Git("checkout", "-b", "branch-b") + commit4 := repo.commitChangeset("unrelated commit", + writeFile{"new.txt", "create new file"}, + ) + // Since the commit did not touch a.txt, it's the expected one. + base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-b"}, base.Branches) + assert.Equal(t, commit4.Hash, base.Hash) + }) + t.Run("ignore unknown objects", func(t *testing.T) { + // It's okay if the diff contains unknown hashes. + diff2 := ` +diff --git a/b.txt b/b.txt +deleted file mode 100644 +index f70f10e..0000000 +--- a/b.txt ++++ /dev/null +@@ -1 +0,0 @@ +-A` + twoDiffs := append(append([]byte{}, diff...), diff2...) + base, err := repo.repo.BaseForDiff(twoDiffs, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-b"}, base.Branches) + }) +} -- cgit mrf-deployment