1 Star 0 Fork 34

zqq61/git-repo-clean

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
parser.go 21.29 KB
一键复制 编辑 原始数据 按行查看 历史
Li Linchao 提交于 2022-01-19 11:20 . fix some go-staticcheck issues

package main
import (
"bytes"
"fmt"
"io"
"os"
"regexp"
"strconv"
"strings"
mapset "github.com/deckarep/golang-set"
)
// Regular-expression fragments shared by the parse_* helpers in this file.
// Each fragment is appended to a literal prefix (for example "mark :" or
// "data ") and handed to Match; every fragment requires the line to end with
// a newline.
const (
idx_re = `(\d+)\n$` // decimal number at end of line: mark :1, data 9
oid_re = `([0-9a-f]{40})\n$` // 40 lowercase hex digits: original-oid 401fb905f1abf1d35331d0cddc8556ba23c1a212
user_re = `(.*?) <(.*?)> (.*)\n$` // name, email, date: author|committer|tagger Li Linchao <lilinchao@oschina.cn> 1633964331 +0800
ref_re = `(.*)\n$` // rest of the line: commit|reset|tag refs/tags/v1.0.0, and the id after "from :" / "merge :"
)
// Package-level state shared by the parsing and dumping code in this file.
var (
HASH_ID = make(map[string]int32) // original object hash -> mark id, filled in by the dump methods
SKIPPED_COMMITS = mapset.NewSet() // mark ids of commits passed to Commit.skip
Lasted_commit = make(map[string]int32) // ref -> mark id; set by parseReset and used by parseCommit as the parent when a commit lists none
Lasted_orig_commit = make(map[string]int32) // ref -> mark id; same role as Lasted_commit, but feeds the orig_parents list
Branch_changed = mapset.NewSet() // record branches that have been changed
Files_changed = mapset.NewSet() // record files for LFS
)
// origParents is a list of parent mark ids.
type origParents []int32
// hasFilechange tells whether a commit carries at least one filechange.
type hasFilechange bool
// Helper_info holds side information about a parsed commit; parseCommit
// returns it together with the Commit itself.
type Helper_info struct {
orig_parents origParents // parent ids as read from the stream, before translation through IDs
has_filechange hasFilechange // true when the parsed commit has one or more filechanges
}
// Match applies the regular expression pattern to str and returns the
// leftmost match followed by its capture groups, or nil when there is no
// match. The pattern is compiled on every call; an invalid pattern panics.
func Match(pattern string, str string) []string {
	return regexp.MustCompile(pattern).FindStringSubmatch(str)
}
/*
Blob models one "blob" command of the fast-export stream, for example:

blob
mark :1
original-oid 401fb905f1abf1d35331d0cddc8556ba23c1a212
data 9
"file a"
*/
type Blob struct {
ele *GitElementsWithID // contain: id, old_id, types, dumped
original_oid string // object hash from the "original-oid" line (40 hex characters)
data_size int64 // size from the "data" line; int64 because a blob may be very large
data []byte // raw data block
sha256 string // SHA-256 of the data, for lfs objects
}
// NewBlob allocates a fresh element with an id, tags it as "blob" and wraps
// it together with the given size, raw data, original hash and sha256 digest.
func NewBlob(size_ int64, data_ []byte, hash_id_, sha256_ string) Blob {
	element := NewGitElementsWithID()
	element.base.types = "blob"

	var blob Blob
	blob.ele = &element
	blob.original_oid = hash_id_
	blob.data_size = size_
	blob.data = data_
	blob.sha256 = sha256_
	return blob
}
// dump marks the blob as dumped, records the hash <-> mark id mapping in
// HASH_ID and ID_HASH, and writes the blob to writer in fast-import format:
//
//	blob
//	mark :<id>
//	original-oid <hash>
//	data <size>
//	<raw bytes>
//
// The payload is written straight from blob.data instead of being formatted
// into a string first, so a very large blob is not copied twice in memory.
// The bytes sent to writer are the same as before. Write errors are ignored
// because dump has no way to report them.
func (blob Blob) dump(writer io.WriteCloser) {
	blob.ele.base.dumped = true
	HASH_ID[blob.original_oid] = blob.ele.id
	ID_HASH[blob.ele.id] = blob.original_oid

	header := fmt.Sprintf("blob\nmark :%d\noriginal-oid %s\ndata %d\n",
		blob.ele.id, blob.original_oid, blob.data_size)
	writer.Write([]byte(header))
	writer.Write(blob.data)
	// fast-import accepts an optional LF after the data block.
	writer.Write([]byte("\n"))
}
/*
FileChange models one filechange line of a commit.

the filechange format is: "type mode id filepath\n"
M 100644 :18 files/1.c
M 100644 :18 files/2.c
M 100644 :16 output
type specification:
M: modify
A: add
C: copy
D: delete
mode specification:
100644 or 644: normal, but non executable file
100755 or 755: normal, but executable file
120000: symlink
160000: gitlink
040000: *subdirectory
**NOTE**
when the mode is "040000", the id must be the 40-byte SHA-1 value, not a short mark id
filechanges can be combined:
D A
M 100644 :16 dir/B
this means a file was renamed and moved from another location
*/
type FileChange struct {
base *GitElements
changetype string // "M", "D" or "R"
mode string // file mode, empty for "D"
blob_id string // mark id (without the leading ':') or 40-character hash; empty for "D"
filepath string
branch string // record branch(ref) name this filechange belongs to
}
// NewFileChange creates a FileChange element of type "filechange" from the
// given change type, mode, blob id and path. The branch field is left empty
// for the caller to fill in.
//
// **NOTE**
// For an "M" change the mode and id must be set; for a "D" change both are
// empty. A commit usually carries several filechange lines.
func NewFileChange(types_, mode_, id_, filepath_ string) FileChange {
	element := NewGitElement()
	element.types = "filechange"

	var fc FileChange
	fc.base = &element
	fc.changetype = types_
	fc.mode = mode_
	fc.blob_id = id_
	fc.filepath = filepath_
	return fc
}
// dump writes the filechange to writer as a fast-import line.
//
// A modify whose blob id is "0" is dropped without being marked as dumped.
// Every other change is marked as dumped first; then "M" and "D" emit their
// line, "R" emits nothing, and any other type is reported as unsupported.
func (fc *FileChange) dump(writer io.WriteCloser) {
	isModify := fc.changetype == "M"
	if isModify && fc.blob_id == "0" {
		return
	}
	fc.base.dumped = true

	switch fc.changetype {
	case "M":
		var out string
		if len(fc.blob_id) == 40 {
			// a 40-character id is a full object hash and is written as is
			out = fmt.Sprintf("M %s %s %s\n", fc.mode, fc.blob_id, fc.filepath)
		} else {
			// anything else is a mark id and needs the ':' prefix
			out = fmt.Sprintf("M %s :%s %s\n", fc.mode, fc.blob_id, fc.filepath)
		}
		writer.Write([]byte(out))
	case "D":
		out := fmt.Sprintf("D %s\n", fc.filepath)
		writer.Write([]byte(out))
	case "R":
		// **NOTE** without '-M', git-fast-export expresses a rename as an
		// 'M' plus a 'D', so nothing is written for 'R' here.
	default:
		// unhandled filechange type
		PrintLocalWithRedln("unsupported filechange type")
	}
}
/*
Commit models one "commit" command of the fast-export stream, for example:

commit refs/heads/4
mark :25
original-oid daca020f8360e0b2ea383e195b09b9c6a4a4979b
author Li Linchao <lilinchao@oschina.cn> 1634117087 +0800
committer Li Linchao <lilinchao@oschina.cn> 1634117087 +0800
data 39
Merge branches '5', '6' and '7' into 4 <---- merge three branches into one
from :20
merge :22 <---- one merge command
merge :24 <---- and another one
M 100644 :21 6.md
M 100644 :23 7.md
The author format is: <author_name> SP <author_email> SP <author_date> LF
and the committer format is: <committer_name> SP <committer_email> SP <committer_date> LF
As for the format of "from", see:
https://git-scm.com/docs/git-fast-import#_from
**NOTE**
a). When a commit has no parent (usually the first commit), there is a reset command in front of the commit command.
b). When several branches are merged into one, the commit has several parents and there is at least one merge command.
c). When a commit has a parent commit, it has from or merge (or both); otherwise it has neither.
*/
type Commit struct {
ele *GitElementsWithID // mark id
old_id int32 // previous mark id, given by GitElementsWithID
original_oid string
branch string
author string // whole author line, as matched by parse_user
commiter string // whole committer line, as matched by parse_user
msg_size int32
message []byte // commit message
parents []int32 // from and merge. from may be absent, and merge may occur several times
filechanges []FileChange // multi-line
}
// NewCommit allocates a fresh element with an id, tags it as "commit" and
// fills a Commit from the arguments. old_id starts out equal to the new id.
func NewCommit(original_oid_, branch_, author_, commiter_ string, size_ int32, msg_ []byte, parents_ []int32, filechanges_ []FileChange) Commit {
	element := NewGitElementsWithID()
	element.base.types = "commit"

	var c Commit
	c.ele = &element
	c.old_id = element.id
	c.original_oid = original_oid_
	c.branch = branch_
	c.author = author_
	c.commiter = commiter_
	c.msg_size = size_
	c.message = msg_
	c.parents = parents_
	c.filechanges = filechanges_
	return c
}
// dump marks the commit as dumped, records the hash <-> mark id mapping in
// HASH_ID and ID_HASH, and writes the commit to writer in fast-import format:
// header lines, optional author/committer lines, the message, the parents
// (first one as "from", the rest as "merge"), every filechange and a closing
// blank line.
func (commit *Commit) dump(writer io.WriteCloser) {
	commit.ele.base.dumped = true
	HASH_ID[commit.original_oid] = commit.ele.id
	ID_HASH[commit.ele.id] = commit.original_oid

	// commit.branch is written right after the keyword, without a separator
	fmt.Fprintf(writer, "commit%s\n", commit.branch)
	fmt.Fprintf(writer, "mark :%d\n", commit.ele.id)
	fmt.Fprintf(writer, "original-oid %s\n", commit.original_oid)

	if commit.author != "" {
		writer.Write([]byte(commit.author))
	}
	if commit.commiter != "" {
		writer.Write([]byte(commit.commiter))
	}

	fmt.Fprintf(writer, "data %d\n", commit.msg_size)
	writer.Write(commit.message)

	for pos, parent := range commit.parents {
		if pos == 0 {
			fmt.Fprintf(writer, "from :%d\n", parent)
			continue
		}
		fmt.Fprintf(writer, "merge :%d\n", parent)
	}

	// **NOTE** a commit usually has several filechange lines
	for i := range commit.filechanges {
		commit.filechanges[i].dump(writer)
	}
	writer.Write([]byte("\n"))
}
// first_parent returns the first entry of commit.parents, or 0 when the
// commit has no parents.
func (commit *Commit) first_parent() int32 {
	if len(commit.parents) == 0 {
		return 0
	}
	return commit.parents[0]
}
// skip records the commit in SKIPPED_COMMITS, using old_id when it is
// non-zero and the current mark id otherwise, and then forwards new_id to
// the underlying element's skip.
func (commit *Commit) skip(new_id int32) {
	skipped := commit.old_id
	if skipped == 0 {
		skipped = commit.ele.id
	}
	SKIPPED_COMMITS.Add(skipped)
	commit.ele.skip(new_id)
}
/*
Reset models one "reset" command of the fast-export stream, for example:

reset refs/heads/main
from :12
*/
type Reset struct {
base *GitElements
ref string // ref name as captured by parse_ref_line
from int32 // mark id of the "from" line; values <= 0 mean no from line is written
}
// NewReset creates a Reset element of type "reset". ref_ is the ref name as
// a string; from_ref_ is the short mark id of the optional "from" line.
func NewReset(ref_ string, from_ref_ int32) Reset {
	element := NewGitElement()
	element.types = "reset"

	var reset Reset
	reset.base = &element
	reset.ref = ref_
	reset.from = from_ref_
	return reset
}
// dump marks the reset as dumped and writes the "reset" line to writer.
// When a positive from id is set, a "from" line and a blank line follow.
func (r *Reset) dump(writer io.WriteCloser) {
	r.base.dumped = true

	// r.ref is written right after the keyword, without a separator
	fmt.Fprintf(writer, "reset%s\n", r.ref)
	if r.from <= 0 {
		return
	}
	fmt.Fprintf(writer, "from :%d\n", r.from)
	writer.Write([]byte("\n"))
}
/*
Tag models one "tag" command of the fast-export stream, for example:

tag v1.0.1
mark :13
from :12
original-oid 0e04e40bdf7cb956b36ed39b3063c253bd0d165c
tagger Li Linchao <lilinchao@oschina.cn> 1633941258 +0800
data 11
heavy tagg
The tagger format is: <tagger_name> SP <tagger_email> SP <tagger_date>
The tag message is a data block of fixed size (e.g. "data 11" declares a size of 11); it does not include the LF
*/
type Tag struct {
ele *GitElementsWithID // mark_id, old_id, types, dumped
old_id int32 // mark_id too
tag_name string // tag name(ref) line: tag v1.0.1, tag refs/heads/main
from_ref int32 // from :id line
original_oid string
tagger string // tagger line
msg_size int32 // a tag message is not as large as a blob, so int32 is used
msg []byte // message line, raw bytes
}
// NewTag allocates a fresh element with an id, tags it as "tag" and fills a
// Tag from the arguments. old_id starts out equal to the new mark id;
// from_ref_ is the parent mark id and original_oid_ the tag's object hash.
func NewTag(tag_name_ string, from_ref_ int32, original_oid_, tagger_ string, size_ int32, msg_ []byte) Tag {
	element := NewGitElementsWithID()
	element.base.types = "tag"

	var tag Tag
	tag.ele = &element
	tag.old_id = element.id
	tag.tag_name = tag_name_
	tag.from_ref = from_ref_
	tag.original_oid = original_oid_
	tag.tagger = tagger_
	tag.msg_size = size_
	tag.msg = msg_
	return tag
}
// dump marks the tag as dumped, records the hash <-> mark id mapping in
// HASH_ID and ID_HASH, and writes the tag to writer in fast-import format:
// tag, mark, from, original-oid and tagger lines followed by the message.
func (tag *Tag) dump(writer io.WriteCloser) {
	tag.ele.base.dumped = true
	HASH_ID[tag.original_oid] = tag.ele.id
	ID_HASH[tag.ele.id] = tag.original_oid

	// tag.tag_name is written right after the keyword, without a separator
	fmt.Fprintf(writer, "tag%s\n", tag.tag_name)
	fmt.Fprintf(writer, "mark :%d\n", tag.ele.id)
	fmt.Fprintf(writer, "from :%d\n", tag.from_ref)
	fmt.Fprintf(writer, "original-oid %s\n", tag.original_oid)
	writer.Write([]byte(tag.tagger))
	fmt.Fprintf(writer, "data %d\n%s\n", tag.msg_size, tag.msg)
}
// parse_ref_line extracts the ref from a ref line such as:
// commit refs/xxx/
// reset refs/xxx/
// tag xxx
// reftype is one of: commit, reset, tag.
//
// It returns everything between reftype and the trailing newline, which
// includes the separating space, or "" when the line does not match.
func parse_ref_line(reftype, line string) (refname string) {
	if found := Match(reftype+ref_re, line); len(found) > 0 {
		// the literal ref only, not its type
		return found[1]
	}
	return ""
}
// parse_parent_ref parses a parent line such as:
// from :parent_ref_id
// merge :parent_ref_id
// reftype is either from or merge.
//
// It returns the mark id found on the line and the id IDs.translate maps it
// to, or (0, 0) when the line does not match. A 46-byte line whose bytes 5..44
// are the all-zero hash ("from 0000...0000\n") is reported as a nested-tag
// error and terminates the process.
func parse_parent_ref(reftype, line string) (orig_ref, ref int32) {
	found := Match(reftype+" :"+ref_re, line)

	// from 0000000000000000000000000000000000000000
	if len(line) == 46 && line[5:len(line)-1] == "0000000000000000000000000000000000000000" {
		PrintLocalWithRedln("nested tags error")
		os.Exit(1)
	}

	if len(found) == 0 {
		// not a parent ref line
		return 0, 0
	}

	// a non-numeric id converts to 0; the conversion error is not reported
	parsed, _ := strconv.Atoi(found[1])
	original := int32(parsed)
	translated := IDs.translate(original)
	// only the mark ids are returned, not the whole line
	return original, translated
}
// parse_mark extracts the numeric id from a "mark :<id>" line. It returns 0
// when the line does not match (after printing a message) or when the number
// cannot be converted.
func parse_mark(line string) (idx int32) {
	found := Match("mark :"+idx_re, line)
	if len(found) == 0 {
		PrintLocalWithRedln("no match mark id")
		return 0
	}
	value, err := strconv.Atoi(found[1])
	if err != nil {
		return 0
	}
	return int32(value)
}
// parse_original_oid extracts the 40-character hash from an
// "original-oid <hash>" line. It prints a message and returns "" when the
// line does not match.
func parse_original_oid(line string) (oid string) {
	if found := Match("original-oid "+oid_re, line); len(found) > 0 {
		// the bare oid string
		return found[1]
	}
	PrintLocalWithRedln("no match original-oid")
	return ""
}
// parse_datasize extracts the byte count from a "data <n>" line. It returns
// -1 when the line does not match (after printing a message) or when the
// number does not fit in an int64.
func parse_datasize(line string) int64 {
	found := Match("data "+idx_re, line)
	if len(found) == 0 {
		PrintLocalWithRedln("no match data size")
		return -1
	}
	if size, err := strconv.ParseInt(found[1], 10, 64); err == nil {
		return size
	}
	return -1
}
// parse_user matches an author, committer or tagger line. usertype selects
// which one. On success the whole matched text is returned, i.e. the line
// from usertype up to and including the newline; otherwise "" is returned.
func parse_user(usertype, line string) (use string) {
	found := Match(usertype+" "+user_re, line)
	if len(found) > 0 && found[0] != "" {
		// the whole matched line, not the capture groups
		return found[0]
	}
	return ""
}
// parse_filechange parses one filechange line of a commit.
//
// The change type can be M(modify), D(delete), C(copy), R(rename) or A(add);
// only M, D and R are handled here:
//
//	M mode :id path   (or: M mode <40-char hash> path)
//	D path
//	R old new
//
// The trailing newline is stripped from the path. For M and D the path is
// everything after the fixed fields, so paths containing spaces are kept
// whole. Any other type, or a line with too few fields to index safely,
// yields an empty FileChange.
func parse_filechange(line string) FileChange {
	arr := strings.Split(line, " ")

	switch arr[0] {
	case "M":
		if len(arr) < 3 {
			// malformed line: no mode or id to read
			return FileChange{}
		}
		mode := arr[1]
		blobID := arr[2]
		if strings.HasPrefix(blobID, ":") {
			// mark id: keep the number without the leading ':'
			blobID = blobID[1:]
			// a non-numeric id converts to 0; the conversion error is not reported
			orig, _ := strconv.Atoi(blobID)
			IDs.translate(int32(orig))
		}
		// **NOTE** special case: M 100644 :18 "bad dir/2.c
		path := strings.TrimSuffix(strings.Join(arr[3:], " "), "\n")
		return NewFileChange("M", mode, blobID, path)
	case "D":
		if len(arr) < 2 {
			// malformed line: no path
			return FileChange{}
		}
		// use the re-joined remainder, not just arr[1], so spaces survive
		path := strings.TrimSuffix(strings.Join(arr[1:], " "), "\n")
		return NewFileChange("D", "", "", path)
	case "R":
		if len(arr) < 3 {
			// malformed line: old or new path missing
			return FileChange{}
		}
		oldPath := arr[1]
		newPath := strings.TrimSuffix(arr[2], "\n")
		return NewFileChange("R", "", oldPath, newPath)
	}
	return FileChange{}
}
// parse_data reads a raw data block of size bytes. line is the first line of
// the block; further lines are pulled from iter until size bytes are read.
//
// Returns:
//   - n: the number of bytes consumed. It is larger than size when the last
//     line read also contained the start of the next command.
//   - data: the data block. When n > size it is cut back to exactly size bytes.
//   - extra_msg: the bytes read beyond size (the part of the last line that
//     belongs to the next command), or nil.
//
// **NOTE**
// The data size may be zero, in which case nothing is read and n is 0.
// A negative size (the parse-error value of parse_datasize) returns n = -1
// without reading. If the stream ends before size bytes were read, the
// bytes gathered so far are returned.
func (iter *FEOutPutIter) parse_data(line string, size int64) (n int64, data, extra_msg []byte) {
	if size < 0 {
		return -1, nil, nil
	}

	var buf bytes.Buffer
	var sum int64
	cur := line
	// if data size is 0, no need to read any more data
	for size != 0 {
		written, err := buf.Write([]byte(cur))
		if err != nil {
			PrintRedln(fmt.Sprint(err))
		}
		if written != len(cur) {
			PrintLocalWithRedln("failed to write data")
		}
		sum += int64(written)
		if sum == size {
			break
		}
		if sum > size {
			// the last line overshot: split it into the tail of the data
			// block and the leftover that belongs to the next command
			overshoot := sum - size
			extra_msg = []byte(cur)[int64(len(cur))-overshoot:]
			// cut by length; trimming by character set could also remove
			// legitimate trailing bytes of the data
			return sum, buf.Bytes()[:size], extra_msg
		}
		next, readErr := iter.Next()
		if readErr != nil && next == "" {
			// the stream ended inside the data block; stop instead of spinning
			PrintRedln(fmt.Sprint(readErr))
			break
		}
		cur = next
	}
	return sum, buf.Bytes(), extra_msg
}
// parseBlob parses the lines following a "blob" line: mark, original-oid,
// data size and the data block itself, and returns the resulting Blob.
// The blob's old_id is set to the parsed mark id and the old -> new id
// mapping is recorded in IDs. When the mark or the original-oid cannot be
// parsed, an empty Blob is returned. The line argument is not used.
func (iter *FEOutPutIter) parseBlob(line string) *Blob {
	markLine, _ := iter.Next()
	mark_id := parse_mark(markLine)
	if mark_id == 0 {
		fmt.Println("DEBUG: parse blob error: mark id should > 0")
		return &Blob{}
	}

	oidLine, _ := iter.Next()
	oid := parse_original_oid(oidLine)
	if oid == "" {
		fmt.Println("DEBUG: parse blob error: original oid should not empty")
		return &Blob{}
	}

	sizeLine, _ := iter.Next()
	size := parse_datasize(sizeLine)

	firstDataLine, _ := iter.Next()
	_, payload, _ := iter.parse_data(firstDataLine, size)

	digest := GenerateHash(payload, "sha256sum")
	blob := NewBlob(size, payload, oid, digest)
	if mark_id > 0 {
		// the mark read from the stream is the old id; NewBlob assigned a new one
		blob.ele.old_id = mark_id
		IDs.record_rename(mark_id, blob.ele.id)
	}
	return &blob
}
// parseCommit parses one commit command starting at line (the "commit <ref>"
// line, optionally preceded by a blank line) and consumes the stream up to and
// including the blank line that ends the commit.
//
// It reads, in order: mark, original-oid, author, committer, data size, the
// message, an optional "from" line, any number of "merge" lines and the
// filechange lines.
//
// It returns the Commit together with a Helper_info holding the parent ids as
// read from the stream and whether the commit has any filechange. When the
// mark line cannot be parsed, an empty Commit and an empty Helper_info are
// returned.
func (iter *FEOutPutIter) parseCommit(line string) (*Commit, *Helper_info) {
// skip a leading blank line
if line == "\n" {
line, _ = iter.Next()
}
branch := parse_ref_line("commit", line)
newline, _ := iter.Next()
mark_id := parse_mark(newline)
if mark_id == 0 {
PrintLocalWithRedln("no match mark id")
return &Commit{}, &Helper_info{}
}
newline, _ = iter.Next()
original_oid := parse_original_oid(newline)
newline, _ = iter.Next()
author := parse_user("author", newline)
newline, _ = iter.Next()
commiter := parse_user("committer", newline)
newline, _ = iter.Next()
size := parse_datasize(newline)
newline, _ = iter.Next()
actual_size, msg, tail_msg := iter.parse_data(newline, size)
// handle the special case where parse_data read more bytes than the declared
// size: the message has no trailing newline, so the next command is glued to
// the end of the last message line and comes back as tail_msg
var actual_msg []byte
var used bool
orig_parents := make([]int32, 0)
parents := make([]int32, 0)
if actual_size > size {
/*
**Special Case 1:**
data 99
!13 change readonly file
Merge pull request !13 from Cactusinhand/auto-7670704-master-1626686679876from :4 <-------------
merge :22
M 100644 :21 README.en.md
M 100644 :5 files/1.c
Tail msg: from :4
**Special Case 2:**
data 14
Initial commitM 100644 :1 .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md <-------------
M 100644 :2 README.en.md
M 100644 :3 README.md
Tail msg: M 100644 :1 .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md
*/
// terminate the message with the newline that was missing in the stream
actual_msg = append(msg, '\n')
if match := Match("from :"+ref_re, string(tail_msg)); len(match) > 0 {
// get a from parent in extra_msg
// must use parse_parent_ref() method to parse it, otherwise will get a dump error in some case.
old_id, from_id := parse_parent_ref("from", string(tail_msg))
orig_parents = append(orig_parents, old_id)
parents = append(parents, from_id)
used = true
} else {
// the tail is not a from line; it is handled as a filechange further down
used = false
}
msg = actual_msg
}
// next line maybe parents or filechanges
newline, _ = iter.Next()
// from parent
if strings.HasPrefix(newline, "from") {
old_id, from_id := parse_parent_ref("from", newline)
orig_parents = append(orig_parents, old_id)
parents = append(parents, from_id)
newline, _ = iter.Next()
}
// merge parents
for strings.HasPrefix(newline, "merge") {
old_id, merge_id := parse_parent_ref("merge", newline)
orig_parents = append(orig_parents, old_id)
parents = append(parents, merge_id)
newline, _ = iter.Next()
}
// no parent in the stream: fall back to the last commit recorded for this branch
if n := len(orig_parents); n == 0 {
if Lasted_commit[branch] > 0 {
parents = []int32{Lasted_commit[branch]}
}
}
if n := len(orig_parents); n == 0 {
if Lasted_orig_commit[branch] > 0 {
orig_parents = []int32{Lasted_orig_commit[branch]}
}
}
// parse filechanges
file_changes := make([]FileChange, 0)
var filechange FileChange
// if extra_msg is not empty and haven't been used, treat it as filechange
if len(tail_msg) > 1 && !used {
filechange = parse_filechange(string(tail_msg))
filechange.branch = branch
file_changes = append(file_changes, filechange)
}
// the filechange list ends at the first blank line
// NOTE(review): the result of iter.Next() is not checked here; if Next keeps
// returning "" once the stream is exhausted this loop would not terminate — confirm
for newline != "\n" {
filechange = parse_filechange(newline)
filechange.branch = branch
file_changes = append(file_changes, filechange)
newline, _ = iter.Next()
}
// the size stored in the commit is the length of the (possibly repaired) message
commit := NewCommit(original_oid, branch, author, commiter, int32(len(msg)),
msg, parents, file_changes)
if mark_id > 0 {
// the mark read from the stream is the old id; NewCommit assigned a new one
commit.old_id = mark_id
IDs.record_rename(mark_id, commit.ele.id)
}
hinfo := &Helper_info{
orig_parents: orig_parents,
has_filechange: len(commit.filechanges) != 0,
}
return &commit, hinfo
}
// parseReset parses a reset command starting at line (the "reset <ref>"
// line). When the next six bytes of the stream are "commit", the reset has
// no from line and a Reset with from 0 is returned without consuming
// anything. Otherwise the next line is read as the from line and the
// resulting id is stored for the ref in Lasted_commit and Lasted_orig_commit.
func (iter *FEOutPutIter) parseReset(line string) *Reset {
	ref := parse_ref_line("reset", line)

	// this reset is the first reset on the first commit
	if ahead, _ := iter.f.Peek(6); string(ahead) == "commit" {
		first := NewReset(ref, 0)
		return &first
	}

	// otherwise continue with the from line of the reset
	fromLine, _ := iter.Next()
	_, parent := parse_parent_ref("from", fromLine)
	if parent <= 0 {
		delete(Lasted_commit, ref)
		delete(Lasted_orig_commit, ref)
	}

	reset := NewReset(ref, parent)
	Lasted_commit[reset.ref] = reset.from
	Lasted_orig_commit[reset.ref] = reset.from
	return &reset
}
// parseTag parses a tag command starting at line (the "tag <name>" line).
// It reads, in order: mark, from, original-oid, tagger, data size and the
// message. The tag's msg_size is the number of bytes parse_data reports as
// read. When a mark id was parsed, it becomes the tag's old_id and the
// old -> new id mapping is recorded in IDs; otherwise the tag's element is
// skipped with id 0.
func (iter *FEOutPutIter) parseTag(line string) *Tag {
	name := parse_ref_line("tag", line)

	markLine, _ := iter.Next()
	mark_id := parse_mark(markLine)
	if mark_id == 0 {
		PrintLocalWithRedln("no match mark id")
	}

	fromLine, _ := iter.Next()
	_, from := parse_parent_ref("from", fromLine)

	oidLine, _ := iter.Next()
	oid := parse_original_oid(oidLine)

	taggerLine, _ := iter.Next()
	tagger := parse_user("tagger", taggerLine)

	sizeLine, _ := iter.Next()
	declared := parse_datasize(sizeLine)

	dataLine, _ := iter.Next()
	read, msg, _ := iter.parse_data(dataLine, declared)

	tag := NewTag(name, from, oid, tagger, int32(read), msg)
	if mark_id <= 0 {
		tag.ele.skip(0)
		return &tag
	}
	// the mark parsed from the source data is the old id; the new id was
	// generated inside NewTag, so remember old -> new
	tag.old_id = mark_id
	IDs.record_rename(mark_id, tag.ele.id)
	return &tag
}
// Parser drives the rewrite. It reads the fast-export stream line by line,
// parses each blob, commit, reset and tag command, passes the parsed element
// to the matching tweak_* method, and writes the element to the fast-import
// process when its dumped flag is set after tweaking.
//
// The loop ends on the "done" line. It also ends when reading from the
// stream fails, for example when the stream stops before "done", instead of
// looping forever. If either the fast-export iterator or the fast-import
// process cannot be started, the error is reported and Parser returns
// without processing anything.
func (filter *RepoFilter) Parser() {
	if filter.repo.opts.verbose {
		if filter.repo.opts.lfs {
			PrintLocalWithGreenln("start to migrate specified files")
		} else {
			PrintLocalWithGreenln("start to clean up specified files")
		}
	}

	iter, err := filter.repo.NewFastExportIter()
	if err != nil {
		fmt.Fprint(os.Stdout, err)
		return
	}
	defer iter.Close()

	input, cmd, err := filter.repo.FastImportOut()
	if err != nil {
		PrintLocalWithRedln("run git-fast-import process failed")
		return
	}
	// registered only after the error check, so input and cmd are usable here
	defer func() {
		input.Close()
		cmd.Wait()
	}()

	// compile the line classifiers once instead of once per line
	var (
		featureDoneRe = regexp.MustCompile("feature done\n$")
		blobRe        = regexp.MustCompile("^blob\n$")
		commitRe      = regexp.MustCompile("commit (.*)\n$")
		resetRe       = regexp.MustCompile("reset (.*)\n$")
		tagRe         = regexp.MustCompile("tag (.*)\n$")
		doneRe        = regexp.MustCompile("done\n$")
	)

	for {
		line, readErr := iter.Next()

		// the order of the cases matters: "feature done" must be tested
		// before the plain "done" terminator
		switch {
		case featureDoneRe.MatchString(line):
			// stream header, nothing to do
		case blobRe.MatchString(line):
			blob := iter.parseBlob(line)
			filter.tweak_blob(blob)
			if blob.ele.base.dumped {
				blob.dump(input)
			}
		case commitRe.MatchString(line):
			commit, aux_info := iter.parseCommit(line)
			filter.tweak_commit(commit, aux_info)
			if commit.ele.base.dumped {
				commit.dump(input)
			}
		case resetRe.MatchString(line):
			reset := iter.parseReset(line)
			filter.tweak_reset(reset)
			if reset.base.dumped {
				reset.dump(input)
			}
		case tagRe.MatchString(line):
			tag := iter.parseTag(line)
			filter.tweak_tag(tag)
			if tag.ele.base.dumped {
				tag.dump(input)
			}
		case doneRe.MatchString(line):
			iter.Close()
			return
		}

		if readErr != nil {
			// the stream ended (or failed) before the "done" line was seen
			fmt.Fprintln(os.Stderr, readErr)
			return
		}
	}
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Go
1
https://gitee.com/zqq61/git-repo-clean.git
git@gitee.com:zqq61/git-repo-clean.git
zqq61
git-repo-clean
git-repo-clean
main

搜索帮助