git-fat

#!/usr/bin/env python
# -*- mode:python -*-

from __future__ import print_function, with_statement, unicode_literals

import argparse
import collections
import hashlib
import itertools
import json
import os
import subprocess
import sys
import tempfile
import threading
import time

if sys.version_info[0] == 3:  # Python 3
    from configparser import ConfigParser, NoSectionError, NoOptionError
    unicode = str
else:  # Python 2
    from ConfigParser import ConfigParser, NoSectionError, NoOptionError
    from io import open

    json.decoder.JSONDecodeError = ValueError


class DataTransferDirection:
    PULL = 1
    PUSH = 2


def touni(s, encoding="utf8"):
    """Automate unicode conversion"""
    if isinstance(s, (str, unicode)):
        return s
    if hasattr(s, "decode"):
        return s.decode(encoding)
    raise ValueError("Cound not decode")


def tobytes(s, encoding="utf8"):
    """Automatic byte conversion"""
    if isinstance(s, bytes):
        return s
    if hasattr(s, "encode"):
        return s.encode(encoding)
    raise ValueError("Could not encode")


BLOCK_SIZE = 4096


def which(program):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None


def hash2fname(hvalue):
    """
    Convert SHA-1 has to filename. Follow git convention
    of using the first 2 characters as folder name.
    """
    fpath = os.path.join(hvalue[:2], hvalue[2:])
    return fpath


def verbose_stderr(*args, **kwargs):
    return print(*args, file=sys.stderr, **kwargs)


def verbose_ignore(*args, **kwargs):
    pass


def mkdir_p(path):
    import errno

    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def umask():
    """Get umask without changing it."""
    old = os.umask(0)
    os.umask(old)
    return old


def readblocks(stream):
    bytes = 0
    while True:
        data = stream.read(BLOCK_SIZE)
        bytes += len(data)
        if not data:
            break
        yield data


def cat_iter(initer, outstream):
    for block in initer:
        outstream.write(block)


def cat(instream, outstream):
    return cat_iter(readblocks(instream), outstream)


def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output

    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
    """
    buffer = []
    partial = ""
    while True:
        newread = input.read(BLOCK_SIZE)
        if not newread:
            break
        newread = touni(newread)
        partial += newread
        while True:
            head, sep, partial = partial.partition("\0")
            if not sep:
                partial = head
                break
            buffer.append(head)
            if len(buffer) == 2:
                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
                path = buffer[1]
                yield (newhash, modflag, path)
                buffer = []


def gitconfig_get(name, file=None):
    args = ["git", "config", "--get"]
    if file is not None:
        args += ["--file", file]
    args.append(name)
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = p.communicate()[0].strip()
    if p.returncode and file is None:
        return None
    elif p.returncode:
        return gitconfig_get(name)
    else:
        return touni(output)


def gitconfig_set(name, value, file=None):
    args = ["git", "config"]
    if file is not None:
        args += ["--file", file]
    args += [name, value]
    subprocess.check_call(args)


# TODO refactor these functions as class members to avoid parsing the config everytime
def config_has_section(file, sect):
    if not os.path.isfile(file):
        sys.stderr.write("Could not find configuration at: %s\n" % file)
        sys.exit(1)
    config = ConfigParser()
    config.read(file)
    return config.has_section(sect)


def config_has_option(file, sect, opt):
    if not os.path.isfile(file):
        sys.stderr.write("Could not find configuration at: %s\n" % file)
        sys.exit(1)
    config = ConfigParser()
    config.read(file)
    return config.has_option(sect, opt)


def config_get(file, sect, key):
    if not os.path.isfile(file):
        sys.stderr.write("Could not find configuration at: %s\n" % file)
        sys.exit(1)
    config = ConfigParser()
    config.read(file)
    try:
        return config.get(sect, key)
    except (NoSectionError, NoOptionError):
        return None


def config_get_options_from_section(file, sect):
    if not os.path.isfile(file):
        sys.stderr.write("Could not find configuration at: %s\n" % file)
        sys.exit(1)
    config = ConfigParser()
    config.read(file)
    return config.items(sect)


def config_set(file, sect, key, val):
    if not os.path.isfile(file):
        sys.stderr.write("Could not find configuration at: %s\n" % file)
        sys.exit(1)
    config = ConfigParser()
    config.read(file)
    config.set(sect, key, val)
    with open(file, 'w') as out:
        config.write(out)


class GitFat(object):
    DecodeError = RuntimeError

    def __init__(self):
        self.is_verbose = os.environ.get("GIT_FAT_VERBOSE")
        self.verbose = verbose_stderr if self.is_verbose else verbose_ignore
        try:
            self.gitroot = subprocess.check_output(
                "git rev-parse --show-toplevel".split()
            ).strip()
            self.gitroot = touni(self.gitroot)
        except subprocess.CalledProcessError:
            sys.exit(1)
        self.gitdir = subprocess.check_output("git rev-parse --git-dir".split()).strip()
        self.gitdir = touni(self.gitdir)
        self.objdir = os.path.join(self.gitdir, "fat", "objects")
        self.encode = self.encode_v2

        def magiclen(enc):
            return len(enc(hashlib.sha1(b"dummy").hexdigest(), 5))

        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v2]]

    def setup(self):
        mkdir_p(self.objdir)

    def is_init_done(self):
        return gitconfig_get("filter.fat.clean") or gitconfig_get("filter.fat.smudge")

    def assert_init_done(self):
        if not self.is_init_done():
            sys.stderr.write(
                "fatal: git-fat is not yet configured in this repository.\n"
            )
            sys.stderr.write('Run "git fat init" to configure.\n')
            sys.exit(1)

    def get_rsync(self):
        cfgpath = os.path.join(self.gitroot, ".gitfat")
        remote = gitconfig_get("rsync.remote", file=cfgpath)
        ssh_port = gitconfig_get("rsync.sshport", file=cfgpath)
        ssh_user = gitconfig_get("rsync.sshuser", file=cfgpath)
        options = gitconfig_get("rsync.options", file=cfgpath)
        if remote is None:
            raise RuntimeError("No rsync.remote in %s" % cfgpath)
        return remote, ssh_port, ssh_user, options

    def local_sha1_checksum(self, path):
        # return hashlib.md5(open(path, "rb").read()).hexdigest()
        # Cheat here
        return path

    def get_all_remote_files(self, bucket):
        if bucket.startswith("s3://"):
            bucket = bucket.replace("s3://", "")

        cmd = ["aws", "s3api", "list-objects-v2", "--bucket", bucket]
        output = []

        initial_output = subprocess.check_output(cmd)
        initial_output = touni(initial_output)

        try:
            processed = json.loads(initial_output)
        except json.decoder.JSONDecodeError:
            processed = {}

        for item in processed.get("Contents", []):
            output.append(item)

        next_token = processed.get("NextToken", None)
        while True:
            if next_token:
                page_command = cmd + ["--starting-token", next_token]
                out = subprocess.check_output(page_command)
                processed = json.loads(out)
                for item in processed.get("Contents", []):
                    output.append(item)
                next_token = processed.get("NextToken", None)
            else:
                break

        return output

    def get_remote_item(self, file, remote_files):
        for remote_file in remote_files:
            if remote_file.get("Key", "") == file:
                return remote_file
        return {}

    def get_aws_cmd(self, direction, s3_bucket, files):
        if not which("aws"):
            sys.stderr.write("Could not find aws cli install.\n")
            sys.exit(1)

        if not s3_bucket.startswith("s3://"):
            s3_bucket = "s3://{}".format(s3_bucket)
        cmds = []
        remote_files = self.get_all_remote_files(s3_bucket)

        if direction == DataTransferDirection.PUSH:
            self.verbose("Pushing to %s" % s3_bucket)
            for file in files:
                self.verbose("Processing {}".format(file))
                local_path = os.path.join(self.objdir, file)
                bucket_path = s3_bucket + "/" + file
                cmd = ["aws", "s3", "cp", local_path, bucket_path]

                remote_item = self.get_remote_item(file, remote_files)
                if not remote_item:
                    # File is not on remote
                    cmds.append(cmd)
                    continue

                if int(remote_item.get("Size")) != os.path.getsize(local_path):
                    cmds.append(cmd)
        else:
            self.verbose("Pulling from %s" % s3_bucket)
            for file in files:
                self.verbose("Processing {}".format(file))
                local_path = os.path.join(self.objdir, file)
                bucket_path = s3_bucket + "/" + file
                cmd = ["aws", "s3", "cp", bucket_path, local_path]
                if not os.path.exists(local_path):
                    cmds.append(cmd)
                    continue
                remote_item = self.get_remote_item(file, remote_files)
                self.verbose(remote_item)
                self.verbose(
                    "{} is {} bytes".format(local_path, os.path.getsize(local_path))
                )
                if int(remote_item.get("Size")) != os.path.getsize(local_path):
                    cmds.append(cmd)
        return cmds

    def get_rsync_command(self, direction):

        (remote, ssh_port, ssh_user, options) = self.get_rsync()
        if direction == DataTransferDirection.PUSH:
            self.verbose("Pushing to %s" % remote)
        else:
            self.verbose("Pulling from %s" % remote)

        cmd = ["rsync", "--progress", "--ignore-existing", "--from0", "--files-from=-"]
        rshopts = ""
        if ssh_user:
            rshopts += " -l " + ssh_user
        if ssh_port:
            rshopts += " -p " + ssh_port
        if rshopts:
            cmd.append("--rsh=ssh" + rshopts)
        if options:
            cmd += options.split(" ")
        if direction == DataTransferDirection.PUSH:
            cmd += [self.objdir + "/", remote + "/"]
        else:
            cmd += [remote + "/", self.objdir + "/"]
        return [cmd]

    def get_rclone_cmd(self, direction, files):

        if not which("rclone"):
            sys.stderr.write("Could not find rclone install.\n")
            sys.exit(1)

        cfgpath = os.path.join(self.gitroot, ".gitfat")
        local_rclone_cfgpath = os.path.join(self.gitroot, ".rclone.conf")
        remote_name = config_get(cfgpath, "rclone", "remote") or "auto"
        remote_dir = config_get(cfgpath, "rclone", "remotedir") or ""

        # If the rclone config is not cached locally, create it
        if not os.path.isfile(local_rclone_cfgpath):
            # Use the config file pointed in .gitfat, if specified
            rclone_config = config_get(cfgpath, "rclone", "config")
            if rclone_config:
                # Cache it locally
                with open(local_rclone_cfgpath, 'w') as outf:
                    subprocess.call(
                        ["rclone", "config", "show", remote_name], stdout=outf
                    )
            else:
                # If 'config' field absent under [rclone], parse the other fields
                # to setup rclone locally
                opts = config_get_options_from_section(cfgpath, "rclone")
                local_rclone_cfg = ConfigParser()
                local_rclone_cfg.add_section(remote_name)
                for k, v in opts:
                    if k in ("remote", "remotedir", "config"):
                        continue
                    local_rclone_cfg.set(remote_name, k, v)
                openmode = "w" if sys.version_info[0] == 3 else "wb"
                with open(local_rclone_cfgpath, openmode) as outf:
                    local_rclone_cfg.write(outf)

        if direction == DataTransferDirection.PUSH:
            self.verbose("Pushing to %s" % remote_name)
        else:
            self.verbose("Pulling from %s" % remote_name)

        cmds = []
        for file in files:
            self.verbose("Processing {}".format(file))
            cmd = ["rclone", "copy"]
            if local_rclone_cfgpath:
                cmd.extend(["--config", local_rclone_cfgpath])
            if direction == DataTransferDirection.PUSH:
                local_path = os.path.join(self.objdir, file)
                remote_path = os.path.join(remote_dir, file)
                remote_fullpath = "{}:{}".format(remote_name, os.path.dirname(remote_path))
                cmd.extend([local_path, remote_fullpath])
            else:
                local_path = os.path.join(self.objdir, os.path.dirname(file))
                remote_path = os.path.join(remote_dir, file)
                remote_fullpath = "{}:{}".format(remote_name, remote_path)
                cmd.extend([remote_fullpath, local_path])
            cmds.append(cmd)
            # TODO skip files that are already in the destination

        return cmds

    def get_data_transfer_command(self, direction, files):

        cfgpath = os.path.join(self.gitroot, ".gitfat")

        # Check if the backend is rsync
        is_rsync = config_has_section(cfgpath, "rsync")
        if is_rsync:
            return self.get_rsync_command(direction)

        # Check if the backend is S3
        is_s3 = config_has_section(cfgpath, "s3")
        if is_s3:
            s3_bucket = config_get(cfgpath, "s3", "bucket")
            return self.get_aws_cmd(direction, s3_bucket, files)

        # Check if the backend is rclone
        is_rclone = config_has_section(cfgpath, "rclone")
        if is_rclone:
            return self.get_rclone_cmd(direction, files)

        raise ValueError("Couldn't figure out the backend. Please check .gitfat")

    def revparse(self, revname):
        return touni(subprocess.check_output(["git", "rev-parse", revname]).strip())

    def encode_v2(self, digest, bytes):
        """
        Produce representation of file to be stored in repository. 20 characters can
        hold 64-bit integers.
        """
        return "#$# git-fat %s %20d\n" % (digest, bytes)

    def decode(self, string, noraise=False):
        cookie = "#$# git-fat "
        string = touni(string)
        if string.startswith(cookie):
            parts = string[len(cookie) :].split()
            digest = parts[0]
            bytes = int(parts[1]) if len(parts) > 1 else None
            return digest, bytes
        elif noraise:
            return None, None
        else:
            raise GitFat.DecodeError("Could not decode %s" % (string))

    def decode_stream(self, stream):
        """
        Return digest if git-fat cache, otherwise iterator over the entire file contents
        """
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        # Fast check
        try:
            stat = os.lstat(fname)
        except OSError:
            return False, None
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        try:
            digest, bytes = self.decode_stream(open(fname, "rb"))
        except IOError:
            return False, None
        if isinstance(digest, str):
            return digest, bytes
        else:
            return None, bytes

    def decode_clean(self, body):
        """
        Attempt to decode version in working tree. The tree version could be changed to
        have a more useful message than the machine-readable copy that goes into
        the repository. If the tree version decodes successfully, it indicates that
        the fat data is not currently available in this repository.
        """
        digest, bytes = self.decode(body, noraise=True)
        return digest

    def filter_clean(self, instream, outstreamclean):
        h = hashlib.new("sha1")
        bytes = 0
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        ishanging = False
        cached = False  # changes to True when file is cached
        try:
            with os.fdopen(fd, "wb") as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(
                                block[0 : self.magiclen]
                        ):
                            # Working tree version is verbatim from repo (not smudged)
                            ishanging = True
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    bytes += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, hash2fname(digest))
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose(
                        "git-fat filter-clean: cache already exists %s" % objfile
                    )
                    os.remove(tmpname)
                else:
                    # Set permissions for the new file using the current umask
                    os.chmod(tmpname, int("444", 8) & ~umask())
                    mkdir_p(os.path.dirname(objfile))
                    os.rename(tmpname, objfile)
                    self.verbose("git-fat filter-clean: caching to %s" % objfile)
                cached = True
                outstreamclean.write(tobytes(self.encode(digest, bytes)))
        finally:
            if not cached:
                os.remove(tmpname)

    def referenced_objects(self, rev=None, all=False):
        referenced = set()
        if all:
            rev = "--all"
        elif rev is None:
            rev = self.revparse("HEAD")
        # Revision list gives us object names to inspect with cat-file...
        p1 = subprocess.Popen(
            ["git", "rev-list", "--objects", rev], stdout=subprocess.PIPE
        )

        def cut_sha1hash(input, output):
            for line in input:
                line = touni(line)
                line = line.split()[0] + "\n"
                output.write(tobytes(line))
            output.close()

        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
        p2 = subprocess.Popen(
            ["git", "cat-file", "--batch-check"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )

        def filter_gitfat_candidates(input, output):
            for line in input:
                line = touni(line)
                objhash, objtype, size = line.split()
                if objtype == "blob" and int(size) in self.magiclens:
                    output.write(tobytes(objhash + "\n"))
            output.close()

        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
        p3 = subprocess.Popen(
            ["git", "cat-file", "--batch"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        # Stream data: p1 | cut_thread | p2 | filter_thread | p3
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        filter_thread = threading.Thread(
            target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)
        )
        cut_thread.start()
        filter_thread.start()
        # Process metadata + content format provided by `cat-file --batch`
        while True:
            metadata_line = p3.stdout.readline()
            if not metadata_line:
                break  # EOF
            objhash, objtype, size_str = touni(metadata_line).split()
            size, bytes_read = int(size_str), 0
            # We know from filter that item is a candidate git-fat object and
            # is small enough to read into memory and process
            content = b""
            while bytes_read < size:
                data = p3.stdout.read(size - bytes_read)
                if not data:
                    break  # EOF
                content += data
                bytes_read += len(data)
            try:
                fathash = touni(self.decode(content)[0])
                referenced.add(hash2fname(fathash))
            except GitFat.DecodeError:
                pass
            # Consume LF record delimiter in `cat-file --batch` output
            bytes_read = 0
            while bytes_read < 1:
                data = p3.stdout.read(1)
                if not data:
                    break  # EOF
                bytes_read += len(data)
        # Ensure everything is cleaned up
        cut_thread.join()
        filter_thread.join()
        p1.wait()
        p2.wait()
        p3.wait()
        return referenced

    def orphan_files(self, patterns=[]):
        "generator for all orphan placeholders in the working tree"
        if not patterns or patterns == [""]:
            patterns = ["."]
        for fname in subprocess.check_output(
            ["git", "ls-files", "-z"] + patterns
        ).split(b"\x00")[:-1]:
            fname = touni(fname)
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def catalog_objects(self):
        catalog = set()
        for x in os.listdir(self.objdir):
            subdir = os.path.join(self.objdir, x)
            if len(x) == 2 and os.path.isdir(subdir):
                for y in os.listdir(subdir):
                    catalog.add(os.path.join(x, y))
        return catalog

    def is_dirty(self):
        return subprocess.call(["git", "diff-index", "--quiet", "HEAD"]) == 0

    def checkout(self, show_orphans=False):
        "Update any stale files in the present working tree"
        self.assert_init_done()
        orphan_files_is_accessible = []
        for digest, fname in self.orphan_files():
            objname = hash2fname(digest)
            objpath = os.path.join(self.objdir, objname)
            orphan_files_is_accessible.append(
                (digest, fname, os.access(objpath, os.R_OK))
            )
        filenames_to_restore = [
            fname
            for digest, fname, accessible in orphan_files_is_accessible
            if accessible
        ]

        for fname in filenames_to_restore:
            # The output of our smudge filter depends on the existence of
            # the file in .git/fat/objects, but git caches the file stat
            # from the previous time the file was smudged, therefore it
            # won't try to re-smudge. I don't know a git command that
            # specifically invalidates that cache, but changing the mtime
            # on the file will invalidate the cache.
            # Here we set the mtime to mtime + 1. This is an improvement
            # over touching the file as it catches the edgecase where a
            # git-checkout happens within the same second as a git fat
            # checkout.
            stat = os.lstat(fname)
            os.utime(fname, (stat.st_atime, stat.st_mtime + 1))

        if show_orphans:
            for digest, fname, accessible in orphan_files_is_accessible:
                if not accessible:
                    print("Data unavailable: %s %s" % (digest, fname))

        filenames_nullterm = b"\x00".join(tobytes(f) for f in filenames_to_restore)

        if not self.is_verbose:
            print("Restoring %d Files" % (len(filenames_to_restore),))
        else:
            print("Restoring %d Files:" % (len(filenames_to_restore),))
            for digest, fname, accessible in orphan_files_is_accessible:
                if accessible:
                    print("%s: %s" % (digest, fname))

        if filenames_to_restore:
            # This re-smudge is essentially a copy that restores permissions.
            cmd = ["git", "checkout-index", "--stdin", "-z", "--index", "--force"]
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
            p.communicate(filenames_nullterm)
            retcode = p.wait()
            if retcode != 0:
                error = subprocess.CalledProcessError(retcode, " ".join(cmd))
                raise error

    def parse_pull_patterns(self, args):
        if "--" not in args:
            return [""]
        else:
            idx = args.index("--")
            patterns = args[idx + 1 :]  # we don't care about '--'
            return patterns

    def filter_objects(self, refargs, patterns):
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        if refargs.get(
            "all"
        ):  # Currently ignores patterns; can we efficiently do both?
            return files
        orphans_matched = list(self.orphan_files(patterns))
        orphans_objects = set(map(lambda x: hash2fname(x[0]), orphans_matched))
        return files & orphans_objects

    def gen_large_blobs(self, revs, threshsize):
        """Build dict of all blobs"""
        time0 = time.time()

        def hash_only(input, output):
            """
            The output of git rev-list --objects shows extra info for blobs,
            subdirectory trees, and tags. This truncates to one hash per line.
            """
            for line in input:
                output.write(line[:40] + b"\n")
            output.close()

        revlist = subprocess.Popen(
            ["git", "rev-list", "--all", "--objects"],
            stdout=subprocess.PIPE,
            bufsize=-1,
        )
        objcheck = subprocess.Popen(
            ["git", "cat-file", "--batch-check"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            bufsize=-1,
        )
        hashonly = threading.Thread(
            target=hash_only, args=(revlist.stdout, objcheck.stdin)
        )
        hashonly.start()
        numblobs = 0
        numlarge = 1
        # Build dict with the sizes of all large blobs
        for line in objcheck.stdout:
            line = touni(line)
            objhash, blob, size = line.split()
            if blob != "blob":
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose(
            "%d of %d blobs are >= %d bytes [elapsed %.3fs]"
            % (numlarge, numblobs, threshsize, time1 - time0)
        )

    def cmd_init(self):
        """
        Configure git-fat
        """
        self.setup()
        if self.is_init_done():
            print("Git fat already configured, check configuration in .git/config")
        else:
            gitconfig_set("filter.fat.clean", "git-fat filter-clean")
            gitconfig_set("filter.fat.smudge", "git-fat filter-smudge")
            print("Initialized git fat")

    def cmd_filter_clean(self):
        """
        The clean filter runs when a file is added to the index.
        It gets the "smudged" (tree) version of the file on stdin and produces the
        "clean" (repository) version on stdout.
        """
        self.setup()
        if hasattr(sys.stdin, "buffer"):
            stdin, stdout = sys.stdin.buffer, sys.stdout.buffer
        else:
            stdin, stdout = sys.stdin, sys.stdout
        self.filter_clean(stdin, stdout)

    def cmd_filter_smudge(self):
        self.setup()
        if hasattr(sys.stdin, "buffer"):
            stdin, stdout = sys.stdin.buffer, sys.stdout.buffer
        else:
            stdin, stdout = sys.stdin, sys.stdout
        result, bytes = self.decode_stream(stdin)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, hash2fname(result))
            try:
                cat(open(objfile, "rb"), stdout)
                self.verbose("git-fat filter-smudge: restoring from %s" % objfile)
            except IOError:  # file not found
                self.verbose("git-fat filter-smudge: fat object missing %s" % objfile)
                stdout.write(
                    tobytes(self.encode(result, bytes))
                )  # could leave a better notice about how to recover this file
        else:  # We have an iterable over the original input.
            self.verbose("git-fat filter-smudge: not a managed file")
            cat_iter(result, stdout)

    def cmd_status(self, show_all=False):
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if show_all:
            refargs["all"] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if show_all:
            for obj in referenced:
                print(obj)
        if orphans:
            print("Orphan objects:")
            for orph in orphans:
                print("    " + orph)
        if garbage:
            print("Garbage objects:")
            for g in garbage:
                print("    " + g)

    def cmd_push(self, push_all=False):
        """
        Push anything that I have stored and referenced
        """
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        files = self.referenced_objects(all=push_all) & self.catalog_objects()
        cmds = self.get_data_transfer_command(
            direction=DataTransferDirection.PUSH,
            files=files
        )
        total = len(cmds)
        counter = 0
        for cmd in cmds:
            counter = counter + 1
            self.verbose("Executing file {} of {}".format(counter, total))
            self.verbose("Executing: %s" % " ".join(cmd))
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
            p.communicate(input=b"\x00".join(tobytes(f) for f in files))
            if p.returncode:
                sys.exit(p.returncode)

    def cmd_pull(self, pull_all=False, pull_args=None):
        """
        Pull anything that I have referenced, but not stored
        """
        self.setup()
        refargs = dict()
        if pull_args is None:
            pull_args = []
        if pull_all:
            refargs["all"] = True
        for arg in pull_args:
            if arg.startswith("-") or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs["rev"] = rev
        files = self.filter_objects(refargs, self.parse_pull_patterns(pull_args))
        cmds = self.get_data_transfer_command(
            direction=DataTransferDirection.PULL,
            files=files
        )
        for cmd in cmds:
            self.verbose("Executing: %s" % " ".join(cmd))
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
            p.communicate(input=b"\x00".join(tobytes(f) for f in files))
            if p.returncode:
                sys.exit(p.returncode)
        self.checkout()

    def cmd_checkout(self):
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        garbage = self.catalog_objects() - self.referenced_objects()
        print("Unreferenced objects to remove: %d" % len(garbage))
        for obj in garbage:
            fname = os.path.join(self.objdir, obj)
            print("%10d %s" % (os.stat(fname).st_size, obj))
            os.remove(fname)

    def cmd_verify(self):
        """Print details of git-fat objects with incorrect data hash"""
        corrupted_objects = []
        for obj in self.catalog_objects():
            fname = os.path.join(self.objdir, obj)
            h = hashlib.new("sha1")
            for block in readblocks(open(fname, "rb")):
                h.update(block)
            expected_hash = obj.replace("/", "")
            data_hash = h.hexdigest()
            if expected_hash != data_hash:
                corrupted_objects.append((expected_hash, data_hash))
        if corrupted_objects:
            print("Corrupted objects: %d" % len(corrupted_objects))
            for expected_hash, data_hash in corrupted_objects:
                print(
                    "Found object stored with hash %s, but the hash of its content "
                    "is actually %s" % (expected_hash, data_hash)
                )
            sys.exit(1)

    def cmd_find(self, size_thresh):
        blobsizes = dict(self.gen_large_blobs("--all", size_thresh))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(lambda: set())
        revlist = subprocess.Popen(
            ["git", "rev-list", "--all"], stdout=subprocess.PIPE, bufsize=-1
        )
        difftree = subprocess.Popen(
            [
                "git",
                "diff-tree",
                "--root",
                "--no-renames",
                "--no-commit-id",
                "--diff-filter=AMCR",
                "-r",
                "--stdin",
                "-z",
            ],
            stdin=revlist.stdout,
            stdout=subprocess.PIPE,
        )
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose("Found %d paths in %.3f s" % (len(pathsizes), time1 - time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(
            pathsizes.items(), key=lambda ps: max(ps[1]), reverse=True
        ):
            print(
                "%-*s filter.fat -text # %10d %d"
                % (maxlen, path, max(sizes), len(sizes))
            )
        revlist.wait()
        difftree.wait()

    def cmd_index_filter(self, filelist_path, manage_gitattributes=False):
        filelist = set(f.strip() for f in open(filelist_path).readlines())
        lsfiles = subprocess.Popen(["git", "ls-files", "-s"], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(
            ["git", "update-index", "--index-info"], stdin=subprocess.PIPE
        )
        for line in lsfiles.stdout:
            line = touni(line)
            mode, sep, tail = line.partition(" ")
            blobhash, sep, tail = tail.partition(" ")
            stageno, sep, tail = tail.partition("\t")
            filename = tail.strip()
            if filename not in filelist:
                continue
            if mode == "120000":
                # skip symbolic links
                continue
            # This file will contain the hash of the cleaned object
            hashfile = os.path.join(self.gitdir, "fat", "index-filter", blobhash)
            try:
                cleanedobj = open(hashfile).read().rstrip()
            except IOError:
                catfile = subprocess.Popen(
                    ["git", "cat-file", "blob", blobhash], stdout=subprocess.PIPE
                )
                hashobject = subprocess.Popen(
                    ["git", "hash-object", "-w", "--stdin"],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                )

                def dofilter():
                    self.filter_clean(catfile.stdout, hashobject.stdin)
                    hashobject.stdin.close()

                filterclean = threading.Thread(target=dofilter)
                filterclean.start()
                cleanedobj = touni(hashobject.stdout.read()).rstrip()
                catfile.wait()
                hashobject.wait()
                filterclean.join()
                mkdir_p(os.path.dirname(hashfile))
                open(hashfile, "w").write(cleanedobj + "\n")
            updateindex.stdin.write(
                tobytes("%s %s %s\t%s\n" % (mode, cleanedobj, stageno, filename))
            )
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = touni(
                    subprocess.check_output(["git", "ls-files", "-s", ".gitattributes"])
                ).split()
                gitattributes_lines = touni(
                    subprocess.check_output(["git", "cat-file", "blob", blobsha1])
                ).splitlines()
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = "100644", "0"
                gitattributes_lines = []
            gitattributes_extra = [
                "%s filter.fat -text" % ln.split()[0] for ln in filelist
            ]
            hashobject = subprocess.Popen(
                ["git", "hash-object", "-w", "--stdin"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
            )
            stdout, stderr = hashobject.communicate(
                b"\n".join(
                    tobytes(ln) for ln in gitattributes_lines + gitattributes_extra
                )
                + b"\n"
            )
            updateindex.stdin.write(
                tobytes(
                    "%s %s %s\t%s\n" % (mode, stdout.strip(), stageno, ".gitattributes")
                )
            )
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()


def run_command(**kwargs):
    fn = kwargs.pop('func').replace("-", "_")
    gitfat = GitFat()
    if not hasattr(gitfat, fn):
        raise Exception("GitFat has no function called '%s'" % fn)
    getattr(gitfat, fn)(**kwargs)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog="git-fat",
        argument_default=argparse.SUPPRESS,
        description="A tool for managing large binary files in git repositories",
    )
    subparsers = parser.add_subparsers()

    # init
    parser_init = subparsers.add_parser("init", help="Initialize git-fat")
    parser_init.set_defaults(func="cmd_init")

    # filter-clean
    parser_fc = subparsers.add_parser("filter-clean", help="For Git internal use")
    parser_fc.set_defaults(func="cmd_filter_clean")

    # filter-smudge
    parser_fs = subparsers.add_parser("filter-smudge", help="For Git internal use")
    parser_fs.set_defaults(func="cmd_filter_smudge")

    # status
    parser_status = subparsers.add_parser(
        "status", help="Show tracked, orphan and stale objects"
    )
    parser_status.add_argument("--all", dest="show_all", action="store_true")
    parser_status.set_defaults(func="cmd_status")

    # push
    parser_push = subparsers.add_parser(
        "push", help="Push fat files to the git-fat remote server"
    )
    parser_push.add_argument("--all", dest="push_all", action="store_true")
    parser_push.set_defaults(func="cmd_push")

    # pull
    parser_pull = subparsers.add_parser(
        "pull", help="Pull fat files from the git-fat remote server"
    )
    parser_pull.add_argument("--all", dest="pull_all", action="store_true")
    parser_pull.add_argument("pull_args", nargs="*")
    parser_pull.set_defaults(func="cmd_pull")

    # gc
    parser_gc = subparsers.add_parser("gc", help="Remove local unreferenced objects")
    parser_gc.set_defaults(func="cmd_gc")

    # verify
    parser_verify = subparsers.add_parser(
        "verify", help="Show broken objects with invalid hash"
    )
    parser_verify.set_defaults(func="cmd_verify")

    # checkout
    parser_checkout = subparsers.add_parser("checkout", help="Restore orphaned objects")
    parser_checkout.set_defaults(func="cmd_checkout")

    # find
    parser_find = subparsers.add_parser(
        "find", help="Find all the objects larger than a threshold"
    )
    parser_find.add_argument("size_thresh", type=int, help="Threshold size in bytes")
    parser_find.set_defaults(func="cmd_find")

    # index-filter
    parser_idxfilter = subparsers.add_parser(
        "index-filter", help="git fat index-filter for filter-branch"
    )
    parser_idxfilter.add_argument(
        "filelist_path", help="Path to the file containing the files to filter"
    )
    parser_idxfilter.add_argument("--manage-gitattributes", action="store_true")
    parser_idxfilter.set_defaults(func="cmd_index_filter")

    args = parser.parse_args()
    kwargs = dict(vars(args))

    run_command(**kwargs)