aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAleksandr Nogikh <nogikh@google.com>2026-01-06 19:23:39 +0100
committerAleksandr Nogikh <nogikh@google.com>2026-01-09 14:28:59 +0000
commit8d6fe1046d948f9b31d21d260a8b1d1cb4145930 (patch)
tree20aa2347caa6fd73c74c1343bb3b453744b16b78
parentab680254a62d631fbad7921aa157e4cc37c1dbc6 (diff)
pkg/vcs: find base commit by blob sha hashes
Given a git diff, determine the latest commit where the modified files still have the exact sha hashes they had at the moment the git patch was created.
-rw-r--r--pkg/vcs/git.go180
-rw-r--r--pkg/vcs/git_test.go84
2 files changed, 264 insertions, 0 deletions
diff --git a/pkg/vcs/git.go b/pkg/vcs/git.go
index a2975c034..6bb826edb 100644
--- a/pkg/vcs/git.go
+++ b/pkg/vcs/git.go
@@ -747,3 +747,183 @@ func (git Git) fetchCommits(since, base, user, domain string, greps []string, fi
}
return commits, s.Err()
}
+
+type BaseCommit struct {
+ *Commit
+ Branches []string
+}
+
+// BaseForDiff selects the most recent commit that could have been the base commit
+// for the specified git patch.
+func (git Git) BaseForDiff(diff []byte, tracer debugtracer.DebugTracer) (*BaseCommit, error) {
+ // We can't just query git log with --find-object=HASH because that will only return
+ // the revisions where the hashed content was introduced or removed, while what we actually
+ // want is the latest revision(s) where the content modified in the diff is still in place.
+
+ // So we build a set of potential commits of interest:
+ // 1) Tips of the branches.
+ // 2) Parents of the commit that in any way mention the modified blob hashes.
+ // Then these commits are verified.
+
+ args := []string{
+ "log",
+ "--all",
+ "--no-renames",
+ // Note that we cannot accelerate it by specifying "--since"
+ "-n", "100",
+ `--format=%P`,
+ }
+ var fileNames []string
+ nameToHash := map[string]string{}
+ for _, file := range ParseGitDiff(diff) {
+ if strings.Trim(file.LeftHash, "0") == "" {
+ // Newly created file are not of any help here.
+ continue
+ }
+ if ok, err := git.verifyHash(file.LeftHash); err != nil {
+ return nil, fmt.Errorf("hash verification failed: %w", err)
+ } else if !ok {
+ // The object is not known in this repository.
+ // Ignore it, or otherwise the command will fail.
+ continue
+ }
+ if _, ok := nameToHash[file.Name]; !ok {
+ // If the diff is actually a concatenation of several diffs, we only
+ // want to remember the first left side hash for each file.
+ fileNames = append(fileNames, file.Name)
+ nameToHash[file.Name] = file.LeftHash
+ }
+ args = append(args, "--find-object="+file.LeftHash)
+ }
+ tracer.Log("extracted %d left blob hashes", len(nameToHash))
+ output, err := git.Run(args...)
+ if err != nil {
+ return nil, err
+ }
+ commitBranches := map[string]map[string]struct{}{}
+ record := func(commit string, branch string) {
+ if commitBranches[commit] == nil {
+ commitBranches[commit] = map[string]struct{}{}
+ }
+ commitBranches[commit][branch] = struct{}{}
+ }
+
+ s := bufio.NewScanner(bytes.NewReader(output))
+ for s.Scan() {
+ // TODO: we can further reduce the search space by adding "--raw" to args
+ // and only considering the commits that introduce the blobs from the diff.
+ firstParent, _, _ := strings.Cut(s.Text(), " ")
+ if firstParent == "" {
+ continue
+ }
+ // Only focus on branches that are still alive.
+ const cutOffDays = 60
+ list, err := git.branchesThatContain(firstParent, time.Now().Add(-time.Hour*24*cutOffDays))
+ if err != nil {
+ return nil, fmt.Errorf("failed to query branches: %w", err)
+ }
+ for _, info := range list {
+ record(firstParent, info.Branch)
+ record(info.Commit, info.Branch)
+ }
+ }
+ var ret *BaseCommit
+ for commit, branches := range commitBranches {
+ tracer.Log("considering %q [%q]", commit, branches)
+ fileHashes, err := git.fileHashes(commit, fileNames)
+ if err != nil {
+ return nil, fmt.Errorf("failed to extract hashes for %s: %w", commit, err)
+ }
+ var noMatch []string
+ for _, name := range fileNames {
+ if !strings.HasPrefix(fileHashes[name], nameToHash[name]) {
+ noMatch = append(noMatch, name)
+ }
+ }
+ if len(noMatch) != 0 {
+ tracer.Log("hashes don't match for %q", noMatch)
+ continue
+ }
+ var branchList []string
+ for branch := range branches {
+ branchList = append(branchList, branch)
+ }
+ sort.Strings(branchList)
+ info, err := git.Commit(commit)
+ if err != nil {
+ return nil, fmt.Errorf("failed to extract commit info: %w", err)
+ }
+ tracer.Log("commit date is %v", info.CommitDate)
+ // Select the most recent commit.
+ if ret == nil || ret.CommitDate.Before(info.CommitDate) {
+ ret = &BaseCommit{Commit: info, Branches: branchList}
+ }
+ }
+ return ret, nil
+}
+
+// fileHashes returns the blob SHA hashes for a particular list of files on a particular commit.
+func (git Git) fileHashes(commit string, files []string) (map[string]string, error) {
+ output, err := git.Run(append([]string{"ls-tree", commit}, files...)...)
+ if err != nil {
+ return nil, err
+ }
+ ret := map[string]string{}
+ s := bufio.NewScanner(bytes.NewReader(output))
+ for s.Scan() {
+ line := s.Text()
+ fields := strings.Fields(line)
+ if len(fields) != 4 {
+ return nil, fmt.Errorf("invalid output: %q", line)
+ }
+ ret[fields[3]] = fields[2]
+ }
+ return ret, nil
+}
+
+type branchCommit struct {
+ Branch string
+ Commit string
+}
+
+func (git Git) branchesThatContain(commit string, since time.Time) ([]branchCommit, error) {
+ output, err := git.Run(
+ "branch", "-a",
+ "--contains", commit,
+ `--format=%(committerdate);%(objectname);%(refname:short)`,
+ )
+ if err != nil {
+ return nil, err
+ }
+ var ret []branchCommit
+ s := bufio.NewScanner(bytes.NewReader(output))
+ for s.Scan() {
+ dateString, branchInfo, _ := strings.Cut(s.Text(), ";")
+ commit, branch, _ := strings.Cut(branchInfo, ";")
+ date, err := time.Parse(gitDateFormat, dateString)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse git date: %w\n%q", err, dateString)
+ }
+ if date.Before(since) {
+ continue
+ }
+ ret = append(ret, branchCommit{Branch: branch, Commit: commit})
+ }
+ return ret, nil
+}
+
+func (git Git) verifyHash(hash string) (bool, error) {
+ _, err := git.Run("rev-parse", "--quiet", "--verify", hash)
+ if err != nil {
+ var verboseErr *osutil.VerboseError
+ if errors.As(err, &verboseErr) && verboseErr.ExitCode == 1 {
+ return false, nil
+ }
+ return false, err
+ }
+ return true, nil
+}
+
+func (git Git) Diff(commitA, commitB string) ([]byte, error) {
+ return git.Run("diff", commitA+".."+commitB)
+}
diff --git a/pkg/vcs/git_test.go b/pkg/vcs/git_test.go
index 5050bcdc0..542b572ac 100644
--- a/pkg/vcs/git_test.go
+++ b/pkg/vcs/git_test.go
@@ -10,7 +10,9 @@ import (
"time"
"github.com/google/go-cmp/cmp"
+ "github.com/google/syzkaller/pkg/debugtracer"
"github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
)
func TestGitParseCommit(t *testing.T) {
@@ -484,3 +486,85 @@ index f70f10e..0000000
},
})
}
+
+func TestGitFileHashes(t *testing.T) {
+ repo := MakeTestRepo(t, t.TempDir())
+ commit1 := repo.commitChangeset("first commit", writeFile{"object.txt", "some text"})
+ commit2 := repo.commitChangeset("second commit", writeFile{"object2.txt", "some text2"})
+
+ map1, err := repo.repo.fileHashes(commit1.Hash, []string{"object.txt", "object2.txt"})
+ require.NoError(t, err)
+ assert.NotEmpty(t, map1["object.txt"])
+
+ map2, err := repo.repo.fileHashes(commit2.Hash, []string{"object.txt", "object2.txt"})
+ require.NoError(t, err)
+ assert.NotEmpty(t, map2["object.txt"])
+ assert.NotEmpty(t, map2["object2.txt"])
+}
+
+func TestBaseForDiff(t *testing.T) {
+ repo := MakeTestRepo(t, t.TempDir())
+ repo.commitChangeset("first commit",
+ writeFile{"a.txt", "content of a.txt"},
+ writeFile{"b.txt", "content of b.txt"},
+ )
+ commit2 := repo.commitChangeset("second commit",
+ writeFile{"c.txt", "content of c.txt"},
+ writeFile{"d.txt", "content of d.txt"},
+ )
+ // Create a diff.
+ commit3 := repo.commitChangeset("third commit",
+ writeFile{"a.txt", "update a.txt"},
+ )
+ diff, err := repo.repo.Diff(commit2.Hash, commit3.Hash)
+ require.NoError(t, err)
+ t.Run("conflicting", func(t *testing.T) {
+ _, err := repo.repo.SwitchCommit(commit2.Hash)
+ require.NoError(t, err)
+ // Create a different change on top of commit2.
+ repo.Git("checkout", "-b", "branch-a")
+ time.Sleep(time.Second)
+ repo.commitChangeset("patch a.txt",
+ writeFile{"a.txt", "another change to a.txt"},
+ )
+ // Yet the patch could only be applied to commit2
+ base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t})
+ require.NoError(t, err)
+ require.NotNil(t, base)
+ assert.Equal(t, []string{"branch-a", "master"}, base.Branches)
+ assert.Equal(t, commit2.Hash, base.Hash)
+ })
+ t.Run("choose latest", func(t *testing.T) {
+ _, err := repo.repo.SwitchCommit(commit2.Hash)
+ require.NoError(t, err)
+ // Wait a second and add one more commit.
+ // (Otherwise the test might be flaky).
+ time.Sleep(time.Second)
+ repo.Git("checkout", "-b", "branch-b")
+ commit4 := repo.commitChangeset("unrelated commit",
+ writeFile{"new.txt", "create new file"},
+ )
+ // Since the commit did not touch a.txt, it's the expected one.
+ base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t})
+ require.NoError(t, err)
+ require.NotNil(t, base)
+ assert.Equal(t, []string{"branch-b"}, base.Branches)
+ assert.Equal(t, commit4.Hash, base.Hash)
+ })
+ t.Run("ignore unknown objects", func(t *testing.T) {
+ // It's okay if the diff contains unknown hashes.
+ diff2 := `
+diff --git a/b.txt b/b.txt
+deleted file mode 100644
+index f70f10e..0000000
+--- a/b.txt
++++ /dev/null
+@@ -1 +0,0 @@
+-A`
+ twoDiffs := append(append([]byte{}, diff...), diff2...)
+ base, err := repo.repo.BaseForDiff(twoDiffs, &debugtracer.TestTracer{T: t})
+ require.NoError(t, err)
+ require.NotNil(t, base)
+ assert.Equal(t, []string{"branch-b"}, base.Branches)
+ })
+}