Skip to content

Commit

Permalink
WIP: Support converting repos to Git LFS
Browse files Browse the repository at this point in the history
Git LFS allows users to commit new files to the LFS store, but replacing
_old_ files requires rewriting history, which is something the BFG is
pretty good at. This rough cut allows replacing blobs with pointer files
throughout repo history.

Some caveats with this initial implementation:

* the BFG cleans concurrently, so files may unnecessarily be hashed more than once
* the working directory isn't updated
* specifying `-fi *.png` should be unnecessary, should use gitattributes
* need for `--no-blob-protection` is a hangover from normal BFG behaviour

Example invocation:

```
$ git clone https://github.com/guardian/membership-frontend.git
$ cd membership-frontend
$ java -jar bfg.jar --convert-to-git-lfs -fi *.png --no-blob-protection
...
$ ls .git/lfs/objects/ | head -2
0145f7c304ef33a43cc946e0a57b2213d24dcaf8462f3d3b332407a8b258369c
07010d5ddea536da56ebdbbb28386921c94abd476046a245b35cd47e8eb6e426
$ git reset --hard
$ cat frontend/assets/images/favicons/152x152.png
version https://git-lfs.github.com/spec/v1
oid sha256:0145f7c304ef33a43cc946e0a57b2213d24dcaf8462f3d3b332407a8b258369c
size 1935
$
```

https://git-lfs.github.com/
https://github.com/github/git-lfs/blob/5eb9bb01/docs/spec.md#the-pointer
  • Loading branch information
rtyley committed Apr 9, 2015
1 parent b9949fe commit 7242876
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2015 Roberto Tyley
*
* This file is part of 'BFG Repo-Cleaner' - a tool for removing large
* or troublesome blobs from Git repositories.
*
* BFG Repo-Cleaner is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* BFG Repo-Cleaner is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/ .
*/

package com.madgag.git.bfg.cleaner

import java.nio.charset.Charset
import java.security.{DigestInputStream, MessageDigest}

import com.google.common.io.ByteStreams
import com.madgag.git.ThreadLocalObjectDatabaseResources
import com.madgag.git.bfg.model.{FileName, TreeBlobEntry}
import org.apache.commons.codec.binary.Hex.encodeHexString
import org.eclipse.jgit.lib.Constants.OBJ_BLOB
import org.eclipse.jgit.lib.ObjectLoader

import scala.util.Try
import scalax.file.Path
import scalax.file.Path.createTempFile
import scalax.io.Resource

/**
 * Tree-blob cleaner that converts selected blobs to Git LFS.
 *
 * For every tree entry whose file name satisfies `lfsSuitableFiles`, the blob content is
 * copied into `lfsObjectsDir` (as a file named by the SHA-256 hex of the content) and the
 * entry is re-pointed at a newly inserted blob holding the Git LFS pointer text. Entries
 * that do not match, or whose LFS file could not be put in place, keep their object id.
 */
trait LfsBlobConverter extends TreeBlobModifier {

  /** Source of the object reader (to load blob content) and inserter (to store pointer blobs). */
  val threadLocalObjectDBResources: ThreadLocalObjectDatabaseResources

  /** Selects, by file name, the tree entries that should be converted. */
  val lfsSuitableFiles: (FileName => Boolean)

  /** Charset used to encode the pointer text into blob bytes. */
  val charset: Charset = Charset.forName("UTF-8")

  /** Directory receiving the LFS objects, one file per object. */
  val lfsObjectsDir: Path

  /**
   * Returns the entry's mode paired with either the id of a freshly inserted LFS pointer
   * blob (when the entry was converted) or the entry's original object id.
   */
  override def fix(entry: TreeBlobEntry) = {
    val oid = (for {
      _ <- Some(entry.filename) filter lfsSuitableFiles
      loader = threadLocalObjectDBResources.reader().open(entry.objectId)
      (shaHex, _) <- buildLfsFileFrom(loader)
    } yield {
      val pointer =
        s"""|version https://git-lfs.github.com/spec/v1
            |oid sha256:$shaHex
            |size ${loader.getSize}
            |""".stripMargin

      threadLocalObjectDBResources.inserter().insert(OBJ_BLOB, pointer.getBytes(charset))
    }).getOrElse(entry.objectId)

    (entry.mode, oid)
  }

  /**
   * Streams the loader's content to a temporary file while computing its SHA-256, then
   * moves that file to `lfsObjectsDir / <sha-hex>` unless a file is already there.
   *
   * @param loader loader for the blob content to store
   * @return the SHA-256 hex and the LFS file path, or `None` when the LFS file could not be
   *         put in place and no existing file of the expected size was found at that path
   */
  def buildLfsFileFrom(loader: ObjectLoader): Option[(String, Path)] = {
    val tmpFile = createTempFile()

    try {
      val digest = MessageDigest.getInstance("SHA-256")

      // The digest is updated as a side effect of the bytes flowing through the copy.
      for {
        inStream <- Resource.fromInputStream(new DigestInputStream(loader.openStream(), digest))
        outStream <- tmpFile.outputStream()
      } ByteStreams.copy(inStream, outStream)

      val shaHex = encodeHexString(digest.digest())

      val lfsPath = lfsObjectsDir / shaHex

      val placedWithoutError = Try(if (!lfsPath.exists) tmpFile moveTo lfsPath).isSuccess

      // A failed move is only acceptable if a file of the right size is at the target path
      // anyway. Previously the outcome of this size check was computed but never acted upon,
      // so a failed move was always reported as a success.
      val lfsFileInPlace =
        placedWithoutError || Try(lfsPath.size.contains(loader.getSize)).getOrElse(false)

      if (lfsFileInPlace) Some(shaHex -> lfsPath) else None
    } finally {
      // Best-effort clean-up; runs even if the copy above throws, so the temp file is not leaked.
      Try(tmpFile.delete(force = true))
    }
  }
}
16 changes: 15 additions & 1 deletion bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ object CLIConfig {
// Registers the 'delete-folders' option; the supplied matcher is stored in the config's deleteFolders.
fileMatcher("delete-folders").text("delete folders with the specified names (eg '.svn', '*-tmp' - matches on folder name, not path within repo)").action {
(v, c) => c.copy(deleteFolders = Some(v))
}
// Registers the hidden 'convert-to-git-lfs' flag, which takes no value and switches on lfsConversion
// in the config. Per its help text it is experimental, and the files to convert are chosen with '-fi'.
opt[Unit]("convert-to-git-lfs").text("experimental support for Git LFS, use with '-fi' to specify files").hidden().action {
(_, c) => c.copy(lfsConversion = true)
}
opt[File]("replace-text").abbr("rt").valueName("<expressions-file>").text("filter content of files, replacing matched text. Match expressions should be listed in the file, one expression per line - " +
"by default, each expression is treated as a literal, but 'regex:' & 'glob:' prefixes are supported, with '==>' to specify a replacement " +
"string other than the default of '***REMOVED***'.").action {
Expand Down Expand Up @@ -129,6 +132,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
filterSizeThreshold: Int = BlobTextModifier.DefaultSizeThreshold,
textReplacementExpressions: Traversable[String] = List.empty,
stripBlobsWithIds: Option[Set[ObjectId]] = None,
lfsConversion: Boolean = false,
strictObjectChecking: Boolean = false,
sensitiveData: Option[Boolean] = None,
massiveNonFileObjects: Option[Int] = None,
Expand Down Expand Up @@ -172,6 +176,16 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
}
}

// The Git LFS blob converter, present only when lfsConversion is switched on. It stores LFS
// objects under the repo directory's 'lfs/objects' folder and converts the files accepted by
// filterContentPredicate.
lazy val lfsBlobConverter: Option[LfsBlobConverter] =
  if (!lfsConversion) None
  else {
    val converter: LfsBlobConverter = new LfsBlobConverter {
      val lfsObjectsDir = repo.getDirectory / "lfs" / "objects"

      val lfsSuitableFiles = filterContentPredicate

      val threadLocalObjectDBResources = repo.getObjectDatabase.threadLocalResources
    }
    Some(converter)
  }

// Uses sensitiveData when it has been set; otherwise true exactly when at least one of
// fileDeletion, folderDeletion or blobTextModifier is defined.
lazy val privateDataRemoval = sensitiveData.getOrElse {
  Seq(fileDeletion, folderDeletion, blobTextModifier).flatten.nonEmpty
}

// Chooses the object-id substitutor according to whether private data is being removed.
lazy val objectIdSubstitutor =
  if (privateDataRemoval) ObjectIdSubstitutor.OldIdsPrivate
  else ObjectIdSubstitutor.OldIdsPublic
Expand Down Expand Up @@ -209,7 +223,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
}
}

Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier).flatten
Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier, lfsBlobConverter).flatten
}

// True when there are no tree-blob cleaners, no folder deletion and no tree-entry-list cleaners.
lazy val definesNoWork =
  treeBlobCleaners.isEmpty &&
    folderDeletion.isEmpty &&
    treeEntryListCleaners.isEmpty
Expand Down
Binary file not shown.
16 changes: 14 additions & 2 deletions bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@

package com.madgag.git.bfg.cli

import com.madgag.git._
import com.madgag.git.bfg.cli.test.unpackedRepo
import org.specs2.mutable._

import scalax.file.ImplicitConversions._
import scalax.file.Path
import com.madgag.git._
import bfg.cli.test.unpackedRepo


class MainSpec extends Specification {

Expand Down Expand Up @@ -52,6 +55,15 @@ class MainSpec extends Specification {
}
}

// Runs the BFG with '--convert-to-git-lfs', restricted to '*.png' files, against a sample repo
// and checks both the repo's packed blobs and the resulting LFS object file.
"convert big blobs to the Git LFS format" in new unpackedRepo("/sample-repos/repoWithBigBlobs.git.zip") {
// Packed blobs of size 11238 should contain exactly the blob abbreviated '596c', which the run is
// expected to remove. NOTE(review): ensureRemovalOfBadEggs is defined elsewhere - confirm its exact semantics.
ensureRemovalOfBadEggs(packedBlobsOfSize(11238), contain(exactly(abbrId("596c")))) {
run("--convert-to-git-lfs --filter-content-including *.png --no-blob-protection")
}
// The converted content is expected under the repo directory's 'lfs/objects' folder, in a file named
// by this 64-hex-character id (presumably the SHA-256 of the blob content - verify against the sample repo).
val lfsFile = repo.getDirectory / "lfs" / "objects" / "e0ebd49837a1cced34b9e7d3ff2fa68a8100df8f158f165ce139e366a941ba6e"

// The LFS file must exist and have the same size as the blob that was converted.
lfsFile.size must beSome(11238)
}

"remove bad folder named '.git'" in new unpackedRepo("/sample-repos/badRepoContainingDotGitFolder.git.zip") {
ensureRemovalOf(commitHistory(haveFolder(".git").atLeastOnce)) {
run("--delete-folders .git --no-blob-protection")
Expand Down

0 comments on commit 7242876

Please sign in to comment.