From fc873d6547f21966a1767eb3028ae21d11b1d127 Mon Sep 17 00:00:00 2001 From: woytowitz Date: Sun, 23 Nov 2014 19:17:02 -0500 Subject: [PATCH 01/18] Cherry picks with modifications from Ozkan's jdsu-ger branch. Updates include Added support for wildcards in .gitattributes Added file name display in verbose mode during clean & smudge operations Fix handling of empty files in filter clean/smudge functions Creation of a "share" directory which allows for fine-grained push/pull and better disk utilization. Addition of pre-push,pre-rebase, and post-merge hooks to allow git push/pull to automatically call git fat push/pull commands as needed Added a help command test.sh and test-retroactive.sh scripts pass. --- git-fat | 357 +++++++++++++++++++++++++++++++++++--------- test-retroactive.sh | 4 +- test.sh | 2 + 3 files changed, 291 insertions(+), 72 deletions(-) diff --git a/git-fat b/git-fat index dd6af72..b504754 100755 --- a/git-fat +++ b/git-fat @@ -7,6 +7,8 @@ import sys import hashlib import tempfile import os +import fnmatch +import filecmp import subprocess import shlex import shutil @@ -124,6 +126,7 @@ def gitconfig_set(name, value, file=None): class GitFat(object): DecodeError = RuntimeError + ConfigError = RuntimeError def __init__(self): self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore try: @@ -149,37 +152,71 @@ class GitFat(object): sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n') sys.stderr.write('Run "git fat init" to configure.\n') sys.exit(1) - def get_rsync(self): - cfgpath = os.path.join(self.gitroot,'.gitfat') - remote = gitconfig_get('rsync.remote', file=cfgpath) - ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) - ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) - options = gitconfig_get('rsync.options', file=cfgpath) + self.fat_init_all() # Upgrade old git-fat setup to the latest one + def get_fat_config(self): + return os.path.join(self.gitroot,'.gitfat') + def 
get_fat_rsync_dirs(self): + cfgpath = self.get_fat_config() + remote = gitconfig_get('rsync.remote', file=cfgpath) if remote is None: - raise RuntimeError('No rsync.remote in %s' % cfgpath) - return remote, ssh_port, ssh_user, options - def get_rsync_command(self,push): - (remote, ssh_port, ssh_user, options) = self.get_rsync() - if push: - self.verbose('Pushing to %s' % (remote)) - else: - self.verbose('Pulling from %s' % (remote)) - + raise GitFat.ConfigError('No rsync.remote in %s' % cfgpath) + share = gitconfig_get('rsync.share', file=cfgpath) + if share is None: + share = gitconfig_get('rsync.local', file=cfgpath) + if share is None and os.path.exists(remote): + share = remote + if share is None: + share = self.objdir + return remote, share + def get_fat_rsync_ssh(self): + cfgpath = self.get_fat_config() + ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) + ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) + options = gitconfig_get('rsync.options', file=cfgpath) + return ssh_port, ssh_user, options + def get_rsync_command(self,src,dst,usessh=True): cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-'] - rshopts = '' - if ssh_user: - rshopts += ' -l ' + ssh_user - if ssh_port: - rshopts += ' -p ' + ssh_port - if rshopts: - cmd.append('--rsh=ssh' + rshopts) + (ssh_port, ssh_user, options) = self.get_fat_rsync_ssh() + if usessh: + rshopts = '' + if ssh_user: + rshopts += ' -l ' + ssh_user + if ssh_port: + rshopts += ' -p ' + ssh_port + if rshopts: + cmd.append('--rsh=ssh' + rshopts) if options: cmd += options.split(' ') + cmd += [src + '/', dst + '/'] + return cmd + def pushpull_to_rsync(self,push,cnt): + (remote, share) = self.get_fat_rsync_dirs() if push: - cmd += [self.objdir + '/', remote + '/'] + src = self.objdir + dst = remote + self.verbose('git-fat pushpull_to_rsync: %d file(s) found to push to %s' % (cnt, remote)) else: - cmd += [remote + '/', self.objdir + '/'] - return cmd + src = remote + dst = share # If share 
is set up, smudge filter will take care of linking self.objdir to share during merge|rebase step of 'pull', therefore always pull from remote to share here. + self.verbose('git-fat pushpull_to_rsync: %d file(s) found to pull from %s' % (cnt, remote)) + return self.get_rsync_command(src, dst) + def symlink_to_share(self, digest): + 'Create self.objdir/digest (links) pointing at share/digest if the configuration of share is set up appropriately' + (remote, share) = self.get_fat_rsync_dirs() + if share == self.objdir or not os.path.exists(share): # Do nothing if share is not set up or points at a non-existing path. + return + sharefile = os.path.join(share, digest) + objfile = os.path.join(self.objdir, digest) + if os.path.lexists(objfile): + os.remove(objfile) + os.symlink(sharefile, objfile) # Note that sharefile may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet. + def convert_digest_to_symlink(self, files, share): + 'Replace self.objdir/digest in files with links pointing at share/digest' + for digest in files: + fat = os.path.join(self.objdir, digest) + sharefile = os.path.join(share, digest) + os.remove(fat) + os.symlink(sharefile, fat) def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode_v1(self, digest, bytes): @@ -213,7 +250,7 @@ class GitFat(object): stat = os.lstat(fname) except OSError: return False, None - if stat.st_size != self.magiclen: + if stat.st_size not in self.magiclens: return False, None # read file try: @@ -233,7 +270,7 @@ class GitFat(object): ''' digest, bytes = self.decode(body, noraise=True) return digest - def filter_clean(self, instream, outstreamclean): + def filter_clean(self, instream, outstreamclean, args): h = hashlib.new('sha1') bytes = 0 fd, tmpname = tempfile.mkstemp(dir=self.objdir) @@ -254,44 +291,54 @@ class GitFat(object): bytes += len(block) outstream.write(block) outstream.flush() - digest = 
h.hexdigest() - objfile = os.path.join(self.objdir, digest) - if not ishanging: - if os.path.exists(objfile): - self.verbose('git-fat filter-clean: cache already exists %s' % objfile) - os.remove(tmpname) - else: - # Set permissions for the new file using the current umask - os.chmod(tmpname, int('444', 8) & ~umask()) - os.rename(tmpname, objfile) - self.verbose('git-fat filter-clean: caching to %s' % objfile) - cached = True - outstreamclean.write(self.encode(digest, bytes)) + # Skip empty files + if bytes != 0: + digest = h.hexdigest() + objfile = os.path.join(self.objdir, digest) + if not ishanging: + if os.path.exists(objfile): + self.verbose('git-fat filter-clean: cache already exists %s (referenced by %s)' % (objfile, str(args[0]))) + os.remove(tmpname) + else: + # Set permissions for the new file using the current umask + os.chmod(tmpname, int('444', 8) & ~umask()) + os.rename(tmpname, objfile) + self.verbose('git-fat filter-clean: caching to %s' % objfile) + cached = True + outstreamclean.write(self.encode(digest, bytes)) finally: if not cached: os.remove(tmpname) - def cmd_filter_clean(self): + def cmd_filter_clean(self, args): ''' The clean filter runs when a file is added to the index. It gets the "smudged" (tree) version of the file on stdin and produces the "clean" (repository) version on stdout. 
''' self.setup() - self.filter_clean(sys.stdin, sys.stdout) + self.filter_clean(sys.stdin, sys.stdout, args) - def cmd_filter_smudge(self): + def cmd_filter_smudge(self, args): self.setup() + filename = str(args[0]) result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest objfile = os.path.join(self.objdir, result) + if not os.access(objfile, os.R_OK): + self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query share, if available, and try again' % (objfile, filename)) + self.symlink_to_share(result) + if not os.access(objfile, os.R_OK): + self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query remote, if available, and try again' % (objfile, filename)) + self.pull_from_remote(set([result])) try: cat(open(objfile), sys.stdout) - self.verbose('git-fat filter-smudge: restoring from %s' % objfile) + self.verbose('git-fat filter-smudge: restoring from %s (referenced by %s)' % (objfile, filename)) except IOError: # file not found - self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) + self.verbose('git-fat filter-smudge: fat object missing %s (required by %s)' % (objfile, filename)) sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file - else: # We have an iterable over the original input. - self.verbose('git-fat filter-smudge: not a managed file') + # We have an non empty iterable over the original input. 
+ elif len(next(result)) != 0: + self.verbose('git-fat filter-smudge: not a managed file (%s)' % filename) cat_iter(result, sys.stdout) def catalog_objects(self): return set(os.listdir(self.objdir)) @@ -365,6 +412,13 @@ class GitFat(object): if digest: yield (digest, fname) + def fat_files(self): + fatfiles = set() + for fatfile in self.catalog_objects(): + if fatfile != '' and not os.path.islink(os.path.join(self.objdir, fatfile)): + fatfiles.add(fatfile) + return fatfiles + def cmd_status(self, args): self.setup() catalog = self.catalog_objects() @@ -387,24 +441,58 @@ class GitFat(object): print(' ' + g) def is_dirty(self): return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) == 0 - def cmd_push(self, args): - 'Push anything that I have stored and referenced' - self.setup() - # Default to push only those objects referenced by current HEAD - # (includes history). Finer-grained pushing would be useful. - pushall = '--all' in args - files = self.referenced_objects(all=pushall) & self.catalog_objects() - cmd = self.get_rsync_command(push=True) + + def push_to_remote(self, files): + if len(files) == 0: + return + cmd = self.pushpull_to_rsync(push=True, cnt=len(files)) self.verbose('Executing: %s' % ' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) if p.returncode: sys.exit(p.returncode) + def push_to_share(self, files): + (remote, share) = self.get_fat_rsync_dirs() + # Do nothing if share is not set up or points at a non-existing path. + if share == self.objdir or not os.path.exists(share): + return + if len(files) == 0: + return + cmd = self.get_rsync_command(self.objdir, share, usessh=False) # ssh parameters do not apply to share. They are for remote only. 
+ self.verbose('git-fat push to share: Executing: %s' % ' '.join(cmd)) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) + self.convert_digest_to_symlink(files, share) + def git_remote_exists(self): + args = ['git', 'remote'] + p = subprocess.Popen(args, stdout=subprocess.PIPE) + output = p.communicate()[0].strip() + if p.returncode or not output: + return False + else: + return True + def cmd_pre_push(self, args): + if self.git_remote_exists(): + self.cmd_push("") + def cmd_push(self, args): + 'Push all fat files that I have stored and referenced' + self.setup() + # Default to push only those objects referenced by current HEAD (includes history) + pushall = '--all' in args + files = self.referenced_objects(all=pushall) & self.fat_files() + self.push_to_remote(files) + self.push_to_share(files) def checkout(self, show_orphans=False): 'Update any stale files in the present working tree' self.assert_init_done() for digest, fname in self.orphan_files(): objpath = os.path.join(self.objdir, digest) + if not os.access(objpath, os.R_OK): + self.symlink_to_share(digest) + if not os.access(objpath, os.R_OK): + self.pull_from_remote(set([digest])) if os.access(objpath, os.R_OK): print('Restoring %s -> %s' % (digest, fname)) # The output of our smudge filter depends on the existence of @@ -418,6 +506,22 @@ class GitFat(object): subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname]) elif show_orphans: print('Data unavailable: %s %s' % (digest,fname)) + def pull_from_remote(self, files): + 'Since this sub is also used by cmd_filter_smudge, stdout needs to be nothing but what git expects => throw away stdout of rsync' + if len(files) == 0: + return + cmd = self.pushpull_to_rsync(push=False, cnt=len(files)) + self.verbose('git-fat pull: Executing: %s' % ' '.join(cmd)) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + stdoutdata = 
p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) + def cmd_post_merge(self, args): + if self.git_remote_exists(): + self.cmd_pull("") + def cmd_pre_rebase(self, args): + if self.git_remote_exists(): + self.cmd_pull("") def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() @@ -431,12 +535,7 @@ class GitFat(object): if rev: refargs['rev'] = rev files = self.filter_objects(refargs, self.parse_pull_patterns(args)) - cmd = self.get_rsync_command(push=False) - self.verbose('Executing: %s' % ' '.join(cmd)) - p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - p.communicate(input='\x00'.join(files)) - if p.returncode: - sys.exit(p.returncode) + self.pull_from_remote(files) self.checkout() def parse_pull_patterns(self, args): @@ -483,13 +582,47 @@ class GitFat(object): print('%s data hash is %s' % (obj, data_hash)) sys.exit(1) + def fat_init_one(self, var, value): + value_cur = gitconfig_get(var) + if value_cur is None or value_cur != value: + gitconfig_set(var, value) + return True + return False + def fat_init_all(self): + ret = False + + post_merge = os.path.join(self.gitdir, 'hooks', 'post-merge') + if not os.path.isfile(post_merge): + with open(post_merge, "w") as f: + lines = ["#!/bin/sh -ex\n", "git fat post-merge \"$@\"\n"] + f.writelines(lines) + os.chmod(post_merge, 0755) + ret = True + + pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase') + if not os.path.isfile(pre_rebase): + with open(pre_rebase, "w") as f: + lines = ["#!/bin/sh -ex\n", "git fat pre-rebase \"$@\"\n"] + f.writelines(lines) + os.chmod(pre_rebase, 0755) + ret = True + + pre_push = os.path.join(self.gitdir, 'hooks', 'pre-push') + if not os.path.isfile(pre_push): + with open(pre_push, "w") as f: + lines = ["#!/bin/sh -ex\n", "git fat pre-push \"$@\"\n"] + f.writelines(lines) + os.chmod(pre_push, 0755) + ret = True + + ret = self.fat_init_one('filter.fat.clean', 'git-fat filter-clean %f') or ret + ret = 
self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret + ret = self.fat_init_one('filter.fat.required', 'true') or ret + return ret def cmd_init(self): self.setup() - if self.is_init_done(): - print('Git fat already configured, check configuration in .git/config') - else: - gitconfig_set('filter.fat.clean', 'git-fat filter-clean') - gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge') + if self.fat_init_all() is True: + self.cmd_post_merge("") print('Initialized git fat') def gen_large_blobs(self, revs, threshsize): """Build dict of all blobs""" @@ -551,7 +684,12 @@ class GitFat(object): blobhash, sep, tail = tail.partition(' ') stageno, sep, tail = tail.partition('\t') filename = tail.strip() - if filename not in filelist: + infilelist = False + for pattern in filelist: + if fnmatch.fnmatch(filename, pattern): + infilelist = True + break + if not infilelist: continue if mode == "120000": # skip symbolic links @@ -564,7 +702,7 @@ class GitFat(object): catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE) hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def dofilter(): - self.filter_clean(catfile.stdout, hashobject.stdin) + self.filter_clean(catfile.stdout, hashobject.stdin, filename) hashobject.stdin.close() filterclean = threading.Thread(target=dofilter) filterclean.start() @@ -590,14 +728,89 @@ class GitFat(object): lsfiles.wait() updateindex.wait() + def cmd_help(self): + objdir = os.path.join(self.gitroot, self.objdir) + # Directories + print('Directories used by git-fat:') + print('- objdir : Contains fat objects(files and/or shared links). 
Shared links will only exist if \'share\' is configured.') + print(' (' + objdir + ')') + try: + (remote, share) = self.get_fat_rsync_dirs() + except GitFat.ConfigError: + (remote, share) = ('', objdir) + pass + print('- share : Directory containing pushed out fat files.') + print(' To increase performance, you are recommended to set this spot to be on a local NAS on your site.') + print(' This directory is shared across all your repos/wcps as well as by your peers if they are using it.') + print(' Setting up this directory offers disk space saving as well as allows fine grained push operation => faster push performance.') + print(' If this configuration option is not set up, its value defaults to remote if it is a directory or \'objdir\'.') + print(' (' + share + ')') + print('- remote : Rsync destination containing pushed out fat files.') + print(' This rsync destination is where everyone who uses this repo pushes their fat files onto.') + print(' (' + remote + ')') + print('share and remote are configured via ' + self.get_fat_config()) + print() + # Definitions + print('Definitions used by git-fat:') + print('- reference objects : List of all fat objects referenced by your working copy. These named files are expected to exist in \'objdir\'.') + print('- catalog objects : List of all fat objects in \'objdir\'') + print('- orphan objects : reference - catalog (subtraction)') + print('- garbage objects : catalog - reference (subtraction)') + print() + # Operation + print('Two primary functions of git-fat are clean and smudge filters that git invokes as necessary:') + print('- filter-clean : (large) file content (input) => translated (small) reference file (output)') + print('- Creates the fat object in \'objdir/...\' using the (large) file content. 
Its name is based on its SHA1.') + print('- filter-smudge : (small) reference file (stdin) => recovered (large) file content (stdout)') + print(' Creates a shared link: \'objdir/...\' -> \'share/...\' for the (large) file (name is based on its SHA1). Bypassed if \'objdir/...\' already exists.') + print(' If \'objdir/...\' is broken, it brings in the (large) file from \'remote\' to \'share\' ==> recovers the file.') + print('') + print('Additional useful functions offered by git-fat are:') + print('- git fat status : Prints orphan and garbage objects') + print('- git fat checkout : Converts all orphan objects into non-orphan state, while automatically executing \'pull\'-like functionality for the specific orphan file.') + print('- git fat gc : Deletes all garbage objects') + print('- git fat verify : Report corrupt fat objects in the catalog') + print('- More info? : Define export var GIT_FAT_VERBOSE and continue using git-fat.') + print('') + print('Typical git operations, when is git-fat involved and what it does when it is invoked:') + print('- git clone ... : See git checkout.') + print('- git fetch : git-fat is not involved.') + print('- git fat pull : Runs git pull') + print(' Brings in data for orphan objects, computed per HEAD (including history) of your working copy, from \'remote\' to \'share\'.') + print(' Creates a sym link: \'objdir/...\' -> \'share/...\' for each orphan object that HEAD points at (no history) ==> No longer orphan.') + print(' Lets git invoke git-fat\'s filter-smudge function') + print('- git fat pull --all : Same as git fat pull except that the orphan objects are computed across all git objects,') + print(' not just per what HEAD (including history) of your working copy.') + print('- git fat push : reference & fat files (not sym links), where & is the intersection operation, is pushed out to:') + print(' - \'remote\'. Diff the same file set between \'objdir\' and \'remote\'. Abort if mismatches.') + print(' - \'share\'. 
Diff the same file set between \'objdir\' and \'share\'. Abort if mismatches.') + print(' Replaces each such file in \'objdir\' with a sym link, pointing at \'share/...\'.') + print(' Runs git push') + print('- git fat push --all : Same steps as git fat push except that reference is computed across all git objects,') + print(' not just what your HEAD (including history) is pointing at.') + print('') + print('- git checkout ... : git invokes git-fat filter-smudge for each file .gitattributes indicates so.') + print('- git add : git invokes git-fat filter-clean if .gitattributes has a matching line for .') + print('- git commit -a [...] : See git add.') + print('- git merge ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') + print('- git rebase ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') + print('- git cherry-pick ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') + print('- git revert ... 
: git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') + if __name__ == '__main__': fat = GitFat() cmd = sys.argv[1] if len(sys.argv) > 1 else '' if cmd == 'filter-clean': - fat.cmd_filter_clean() + fat.cmd_filter_clean(sys.argv[2:]) elif cmd == 'filter-smudge': - fat.cmd_filter_smudge() + fat.cmd_filter_smudge(sys.argv[2:]) + elif cmd == 'pre-push': + fat.cmd_pre_push(sys.argv[2:]) + elif cmd == 'pre-rebase': + fat.cmd_pre_rebase(sys.argv[2:]) + elif cmd == 'post-merge': + fat.cmd_post_merge(sys.argv[2:]) elif cmd == 'init': fat.cmd_init() elif cmd == 'status': @@ -616,5 +829,7 @@ if __name__ == '__main__': fat.cmd_find(sys.argv[2:]) elif cmd == 'index-filter': fat.cmd_index_filter(sys.argv[2:]) + elif cmd == 'help': + fat.cmd_help() else: - print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr) + print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter|help]', file=sys.stderr) diff --git a/test-retroactive.sh b/test-retroactive.sh index 51a38ec..6ed5eb0 100755 --- a/test-retroactive.sh +++ b/test-retroactive.sh @@ -1,7 +1,9 @@ #!/bin/sh -ex +export GIT_FAT_VERBOSE=1 fullpath() { echo "`pwd`/$1"; } +rm -rf retro retro-clone retro-store git init retro cd retro cp /usr/share/dict/words words.big @@ -50,4 +52,4 @@ cd .. git clone file:///$(fullpath retro) retro-clone cd retro-clone git fat init -git fat pull +git pull diff --git a/test.sh b/test.sh index 0ee63ea..1c4dbc2 100755 --- a/test.sh +++ b/test.sh @@ -2,6 +2,8 @@ # Any copyright is dedicated to the Public Domain. 
# http://creativecommons.org/publicdomain/zero/1.0/ +export GIT_FAT_VERBOSE=1 + # Clear out repos and fat store from prior test runs rm -fR fat-test fat-test2 /tmp/fat-store From 94fbe41ed54fd1472743a1dff23313161ae10fce Mon Sep 17 00:00:00 2001 From: woytowitz Date: Mon, 24 Nov 2014 19:44:58 -0500 Subject: [PATCH 02/18] Test for more fine grained @{upstream} being set instead of whether any remote is set. --- git-fat | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/git-fat b/git-fat index b504754..fd7918d 100755 --- a/git-fat +++ b/git-fat @@ -465,8 +465,8 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) self.convert_digest_to_symlink(files, share) - def git_remote_exists(self): - args = ['git', 'remote'] + def git_upstream_exists(self): + args = ['git', 'name-rev', '@{upstream}'] p = subprocess.Popen(args, stdout=subprocess.PIPE) output = p.communicate()[0].strip() if p.returncode or not output: @@ -474,7 +474,7 @@ class GitFat(object): else: return True def cmd_pre_push(self, args): - if self.git_remote_exists(): + if self.git_upstream_exists(): self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' @@ -517,10 +517,10 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) def cmd_post_merge(self, args): - if self.git_remote_exists(): + if self.git_upstream_exists(): self.cmd_pull("") def cmd_pre_rebase(self, args): - if self.git_remote_exists(): + if self.git_upstream_exists(): self.cmd_pull("") def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' From 1d7911c4556d4d60fd98618df0777012ef99ac8e Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Thu, 27 Nov 2014 15:24:28 -0500 Subject: [PATCH 03/18] Ignore stderr when checking for upstream --- git-fat | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/git-fat b/git-fat index fd7918d..35efea0 100755 --- a/git-fat +++ b/git-fat @@ -467,12 +467,13 @@ class 
GitFat(object): self.convert_digest_to_symlink(files, share) def git_upstream_exists(self): args = ['git', 'name-rev', '@{upstream}'] - p = subprocess.Popen(args, stdout=subprocess.PIPE) - output = p.communicate()[0].strip() - if p.returncode or not output: - return False - else: - return True + with open(os.devnull, 'w') as devnull: + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=devnull) + output = p.communicate()[0].strip() + if p.returncode or not output: + return False + else: + return True def cmd_pre_push(self, args): if self.git_upstream_exists(): self.cmd_push("") From 8a4d4b74fd7ea27d2fea2c4d0ebe3cd6fd62ac02 Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Thu, 27 Nov 2014 13:18:21 -0500 Subject: [PATCH 04/18] Revert to make work with Python 2.7 which is what is on my system Revert "Fix syntax and error cleanly on Python-3" This reverts commit 26b6b225b60a57506115d0a707bded11e5a245ff. --- git-fat | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/git-fat b/git-fat index 35efea0..bef16ca 100755 --- a/git-fat +++ b/git-fat @@ -17,10 +17,6 @@ import threading import time import collections -if not type(sys.version_info) is tuple and sys.version_info.major > 2: - sys.stderr.write('git-fat does not support Python-3 yet. 
Please use python2.\n') - sys.exit(1) - try: from subprocess import check_output del check_output @@ -671,7 +667,7 @@ class GitFat(object): time1 = time.time() self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0)) maxlen = max(map(len,pathsizes)) if pathsizes else 0 - for path, sizes in sorted(pathsizes.items(), key=lambda p,s: max(s), reverse=True): + for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True): print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes))) revlist.wait() difftree.wait() From 80444c3bcf945516d6dec7d54eb4a89301c9b6cf Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 13:41:11 -0500 Subject: [PATCH 05/18] Add the filename to fat object names to allow easier association between fat objects and files. The objfile names are now "hash.fname" --- git-fat | 76 ++++++++++++++++++++++++++++++++++++--------------------- test.sh | 9 ++++--- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/git-fat b/git-fat index bef16ca..4083b8e 100755 --- a/git-fat +++ b/git-fat @@ -7,6 +7,7 @@ import sys import hashlib import tempfile import os +import Queue import fnmatch import filecmp import subprocess @@ -154,13 +155,9 @@ class GitFat(object): def get_fat_rsync_dirs(self): cfgpath = self.get_fat_config() remote = gitconfig_get('rsync.remote', file=cfgpath) - if remote is None: - raise GitFat.ConfigError('No rsync.remote in %s' % cfgpath) share = gitconfig_get('rsync.share', file=cfgpath) if share is None: share = gitconfig_get('rsync.local', file=cfgpath) - if share is None and os.path.exists(remote): - share = remote if share is None: share = self.objdir return remote, share @@ -202,7 +199,7 @@ class GitFat(object): if share == self.objdir or not os.path.exists(share): # Do nothing if share is not set up or points at a non-existing path.
return sharefile = os.path.join(share, digest) - objfile = os.path.join(self.objdir, digest) + objfile = os.path.join(self.objdir, digest) if os.path.lexists(objfile): os.remove(objfile) os.symlink(sharefile, objfile) # Note that sharefile may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet. @@ -266,7 +263,7 @@ class GitFat(object): ''' digest, bytes = self.decode(body, noraise=True) return digest - def filter_clean(self, instream, outstreamclean, args): + def filter_clean(self, instream, outstreamclean, fname): h = hashlib.new('sha1') bytes = 0 fd, tmpname = tempfile.mkstemp(dir=self.objdir) @@ -291,9 +288,10 @@ class GitFat(object): if bytes != 0: digest = h.hexdigest() objfile = os.path.join(self.objdir, digest) + objfile += '.' + fname if not ishanging: if os.path.exists(objfile): - self.verbose('git-fat filter-clean: cache already exists %s (referenced by %s)' % (objfile, str(args[0]))) + self.verbose('git-fat filter-clean: cache already exists %s' % objfile) os.remove(tmpname) else: # Set permissions for the new file using the current umask @@ -306,35 +304,35 @@ class GitFat(object): if not cached: os.remove(tmpname) - def cmd_filter_clean(self, args): + def cmd_filter_clean(self, fname): ''' The clean filter runs when a file is added to the index. It gets the "smudged" (tree) version of the file on stdin and produces the "clean" (repository) version on stdout. ''' self.setup() - self.filter_clean(sys.stdin, sys.stdout, args) + self.filter_clean(sys.stdin, sys.stdout, fname) - def cmd_filter_smudge(self, args): + def cmd_filter_smudge(self, fname): self.setup() - filename = str(args[0]) result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest + result += '.' 
+ fname; objfile = os.path.join(self.objdir, result) if not os.access(objfile, os.R_OK): - self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query share, if available, and try again' % (objfile, filename)) + self.verbose('git-fat filter-smudge: fat object missing %s - will query share, if available, and try again' % objfile) self.symlink_to_share(result) if not os.access(objfile, os.R_OK): - self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query remote, if available, and try again' % (objfile, filename)) + self.verbose('git-fat filter-smudge: fat object missing %s - will query remote, if available, and try again' % objfile) self.pull_from_remote(set([result])) try: cat(open(objfile), sys.stdout) - self.verbose('git-fat filter-smudge: restoring from %s (referenced by %s)' % (objfile, filename)) + self.verbose('git-fat filter-smudge: restoring from %s' % objfile) except IOError: # file not found - self.verbose('git-fat filter-smudge: fat object missing %s (required by %s)' % (objfile, filename)) + self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file # We have an non empty iterable over the original input. elif len(next(result)) != 0: - self.verbose('git-fat filter-smudge: not a managed file (%s)' % filename) + self.verbose('git-fat filter-smudge: not a managed file (%s)' % fname) cat_iter(result, sys.stdout) def catalog_objects(self): return set(os.listdir(self.objdir)) @@ -344,12 +342,19 @@ class GitFat(object): rev = '--all' elif rev is None: rev = self.revparse('HEAD') + + # Queue for exchanging hash/fname pairs between threads + _sentinel = object() + queue = Queue.Queue() + # Revision list gives us object names to inspect with cat-file... 
p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) - def cut_sha1hash(input, output): + def cut_sha1hash(input, output, queue): for line in input: output.write(line.split()[0] + '\n') + queue.put(line) output.close() + queue.put(_sentinel) # ...`cat-file --batch-check` filters for git-fat object candidates in bulk... p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def filter_gitfat_candidates(input, output): @@ -361,7 +366,7 @@ class GitFat(object): # ...`cat-file --batch` provides full contents of git-fat candidates in bulk p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) # Stream data: p1 | cut_thread | p2 | filter_thread | p3 - cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) + cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin, queue)) filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)) cut_thread.start() filter_thread.start() @@ -383,6 +388,15 @@ class GitFat(object): bytes_read += len(data) try: fathash = self.decode(content)[0] + while True: + data = queue.get() + list = str(data).split() + if objhash == list[0]: + fathash += '.' 
+ list[1] + break + if data is _sentinel: + queue.put(_sentinel) + break referenced.add(fathash) except GitFat.DecodeError: pass @@ -463,19 +477,21 @@ class GitFat(object): self.convert_digest_to_symlink(files, share) def git_upstream_exists(self): args = ['git', 'name-rev', '@{upstream}'] - with open(os.devnull, 'w') as devnull: - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=devnull) - output = p.communicate()[0].strip() - if p.returncode or not output: - return False - else: - return True + p = subprocess.Popen(args, stdout=subprocess.PIPE) + output = p.communicate()[0].strip() + if p.returncode or not output: + return False + else: + return True def cmd_pre_push(self, args): if self.git_upstream_exists(): self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' self.setup() + (remote, share) = self.get_fat_rsync_dirs() + if remote is None: + return # Default to push only those objects referenced by current HEAD (includes history) pushall = '--all' in args files = self.referenced_objects(all=pushall) & self.fat_files() @@ -485,6 +501,7 @@ class GitFat(object): 'Update any stale files in the present working tree' self.assert_init_done() for digest, fname in self.orphan_files(): + digest += '.' 
+ fname; objpath = os.path.join(self.objdir, digest) if not os.access(objpath, os.R_OK): self.symlink_to_share(digest) @@ -522,6 +539,9 @@ class GitFat(object): def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() + (remote, share) = self.get_fat_rsync_dirs() + if remote is None: + return refargs = dict() if '--all' in args: refargs['all'] = True @@ -740,7 +760,7 @@ class GitFat(object): print(' To increase performance, you are recommended to set this spot to be on a local NAS on your site.') print(' This directory is shared across all your repos/wcps as well as by your peers if they are using it.') print(' Setting up this directory offers disk space saving as well as allows fine grained push operation => faster push performance.') - print(' If this configuration option is not set up, its value defaults to remote if it is a directory or \'objdir\'.') + print(' If this configuration option is not set up, its value defaults to \'objdir\'.') print(' (' + share + ')') print('- remote : Rsync destination containing pushed out fat files.') print(' This rsync destination is where everyone who uses this repo pushes their fat files onto.') @@ -799,9 +819,9 @@ if __name__ == '__main__': fat = GitFat() cmd = sys.argv[1] if len(sys.argv) > 1 else '' if cmd == 'filter-clean': - fat.cmd_filter_clean(sys.argv[2:]) + fat.cmd_filter_clean(sys.argv[2]) elif cmd == 'filter-smudge': - fat.cmd_filter_smudge(sys.argv[2:]) + fat.cmd_filter_smudge(sys.argv[2]) elif cmd == 'pre-push': fat.cmd_pre_push(sys.argv[2:]) elif cmd == 'pre-rebase': diff --git a/test.sh b/test.sh index 1c4dbc2..76484d8 100755 --- a/test.sh +++ b/test.sh @@ -6,6 +6,7 @@ export GIT_FAT_VERBOSE=1 # Clear out repos and fat store from prior test runs rm -fR fat-test fat-test2 /tmp/fat-store +mkdir -p /tmp/fat-store git init fat-test cd fat-test @@ -57,10 +58,10 @@ rm d git fat pull # Check verify command finds corrupt object -mv 
.git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ - .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat.bak echo "Not the right data" > .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 git fat verify && true if [ $? -eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi -mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ - .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat.bak \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat From 500adfdb768011ee50b7519e05e0335e752dd36f Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 15:28:05 -0500 Subject: [PATCH 06/18] Update previous commit so that it works with files in subdirectories --- git-fat | 13 +++++++------ test.sh | 13 +++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/git-fat b/git-fat index 4083b8e..3d0412d 100755 --- a/git-fat +++ b/git-fat @@ -202,6 +202,7 @@ class GitFat(object): objfile = os.path.join(self.objdir, digest) if os.path.lexists(objfile): os.remove(objfile) + mkdir_p(os.path.dirname(objfile)) os.symlink(sharefile, objfile) # Note that sharefile may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet. def convert_digest_to_symlink(self, files, share): 'Replace self.objdir/digest in files with links pointing at share/digest' @@ -287,8 +288,7 @@ class GitFat(object): # Skip empty files if bytes != 0: digest = h.hexdigest() - objfile = os.path.join(self.objdir, digest) - objfile += '.' + fname + objfile = os.path.join(self.objdir, fname + '.' 
+ digest) if not ishanging: if os.path.exists(objfile): self.verbose('git-fat filter-clean: cache already exists %s' % objfile) @@ -296,6 +296,7 @@ class GitFat(object): else: # Set permissions for the new file using the current umask os.chmod(tmpname, int('444', 8) & ~umask()) + mkdir_p(os.path.dirname(objfile)) os.rename(tmpname, objfile) self.verbose('git-fat filter-clean: caching to %s' % objfile) cached = True @@ -316,7 +317,7 @@ class GitFat(object): self.setup() result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest - result += '.' + fname; + result = fname + '.' + result; objfile = os.path.join(self.objdir, result) if not os.access(objfile, os.R_OK): self.verbose('git-fat filter-smudge: fat object missing %s - will query share, if available, and try again' % objfile) @@ -392,7 +393,7 @@ class GitFat(object): data = queue.get() list = str(data).split() if objhash == list[0]: - fathash += '.' + list[1] + fathash = list[1] + '.' + fathash break if data is _sentinel: queue.put(_sentinel) @@ -417,7 +418,7 @@ class GitFat(object): def orphan_files(self, patterns=[]): 'generator for all orphan placeholders in the working tree' - for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00')[:-1]: + for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00'): digest = self.decode_file(fname)[0] if digest: yield (digest, fname) @@ -501,7 +502,7 @@ class GitFat(object): 'Update any stale files in the present working tree' self.assert_init_done() for digest, fname in self.orphan_files(): - digest += '.' + fname; + digest = fname + '.' 
+ digest; objpath = os.path.join(self.objdir, digest) if not os.access(objpath, os.R_OK): self.symlink_to_share(digest) diff --git a/test.sh b/test.sh index 76484d8..2f77817 100755 --- a/test.sh +++ b/test.sh @@ -13,7 +13,8 @@ cd fat-test git fat init cat - >> .gitfat < .gitattributes git add .gitattributes .gitfat @@ -58,10 +59,10 @@ rm d git fat pull # Check verify command finds corrupt object -mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat \ - .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat.bak -echo "Not the right data" > .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +mv .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ + .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak +echo "Not the right data" > .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 git fat verify && true if [ $? -eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi -mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat.bak \ - .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.b.fat +mv .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ + .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 From f8cc1ca6765e001b94c108cd5167c476f947992c Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 19:02:51 -0500 Subject: [PATCH 07/18] Fixed some bugs with new full path scheme for fat objects --- git-fat | 4 ++-- test-retroactive.sh | 7 ++++--- test.sh | 15 ++++++++------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/git-fat b/git-fat index 3d0412d..afd3694 100755 --- a/git-fat +++ b/git-fat @@ -336,7 +336,7 @@ class GitFat(object): self.verbose('git-fat filter-smudge: not a managed file (%s)' % fname) cat_iter(result, sys.stdout) def catalog_objects(self): - return set(os.listdir(self.objdir)) + return set([os.path.join(dp[len(self.objdir)+1:], f) for dp, dn, filenames in os.walk(self.objdir) for f in filenames]) 
def referenced_objects(self, rev=None, all=False): referenced = set() if all: @@ -457,7 +457,7 @@ class GitFat(object): if len(files) == 0: return cmd = self.pushpull_to_rsync(push=True, cnt=len(files)) - self.verbose('Executing: %s' % ' '.join(cmd)) + self.verbose('git-fat push to remote Executing: %s' % ' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) if p.returncode: diff --git a/test-retroactive.sh b/test-retroactive.sh index 6ed5eb0..eeeeb7a 100755 --- a/test-retroactive.sh +++ b/test-retroactive.sh @@ -10,8 +10,9 @@ cp /usr/share/dict/words words.big chmod u+w words.big git add words.big git commit -m'Add big file without using git-fat' -sort words.big > sorted.big -git add sorted.big +mkdir sub +sort words.big > sub/sorted.big +git add sub/sorted.big git commit -m'Add sorted file without using git-fat' cat > .gitattributes < a.fat git add a.fat git commit -m'add a.fat' -echo 'fat content b' > b.fat -git add b.fat +mkdir sub +echo 'fat content b' > sub/b.fat +git add sub/b.fat git commit -m'add b.fat' echo 'revise fat content a' > a.fat git commit -am'revise a.fat' @@ -59,10 +60,10 @@ rm d git fat pull # Check verify command finds corrupt object -mv .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ - .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak -echo "Not the right data" > .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +mv .git/fat/objects/sub/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ + .git/fat/objects/sub/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak +echo "Not the right data" > .git/fat/objects/sub/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 git fat verify && true if [ $? 
-eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi -mv .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ - .git/fat/objects/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +mv .git/fat/objects/sub/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ + .git/fat/objects/sub/b.fat.6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 From 2c8926c6ccbc1f69c162743fc1d121b297022c76 Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 21:10:12 -0500 Subject: [PATCH 08/18] Fix git fat verify so it works with full path objdir files. --- git-fat | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/git-fat b/git-fat index afd3694..a7d2361 100755 --- a/git-fat +++ b/git-fat @@ -291,7 +291,6 @@ class GitFat(object): objfile = os.path.join(self.objdir, fname + '.' + digest) if not ishanging: if os.path.exists(objfile): - self.verbose('git-fat filter-clean: cache already exists %s' % objfile) os.remove(tmpname) else: # Set permissions for the new file using the current umask @@ -592,7 +591,7 @@ class GitFat(object): for block in readblocks(open(fname)): h.update(block) data_hash = h.hexdigest() - if obj != data_hash: + if not obj.endswith(data_hash): corrupted_objects.append((obj, data_hash)) if corrupted_objects: print('Corrupted objects: %d' % len(corrupted_objects)) From 88bbfaa07e3b9c1e69312e40bcb1c4fd0d378410 Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 21:35:37 -0500 Subject: [PATCH 09/18] Default git fat find arg to 0 if not present. 
Fixup find command to return proper # of large blobs found --- git-fat | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/git-fat b/git-fat index a7d2361..8e62255 100755 --- a/git-fat +++ b/git-fat @@ -655,7 +655,7 @@ class GitFat(object): objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1) hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin)) hashonly.start() - numblobs = 0; numlarge = 1 + numblobs = 0; numlarge = 0 # Build dict with the sizes of all large blobs for line in objcheck.stdout: objhash, blob, size = line.split() @@ -672,8 +672,10 @@ class GitFat(object): time1 = time.time() self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0)) def cmd_find(self, args): - maxsize = int(args[0]) - blobsizes = dict(self.gen_large_blobs('--all', maxsize)) + minsize = 0 + if len(args): + minsize = int(args[0]) + blobsizes = dict(self.gen_large_blobs('--all', int(minsize))) time0 = time.time() # Find all names assumed by large blobs (those in blobsizes) pathsizes = collections.defaultdict(lambda:set()) From 3651da53ff2086cf4a9d79d6a0ed12ec62dbee17 Mon Sep 17 00:00:00 2001 From: woytowitz Date: Fri, 28 Nov 2014 21:58:23 -0500 Subject: [PATCH 10/18] Cleanup help descriptions --- git-fat | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/git-fat b/git-fat index 8e62255..cad4227 100755 --- a/git-fat +++ b/git-fat @@ -794,27 +794,27 @@ class GitFat(object): print('Typical git operations, when is git-fat involved and what it does when it is invoked:') print('- git clone ... 
: See git checkout.') print('- git fetch : git-fat is not involved.') - print('- git fat pull : Runs git pull') + print('- git pull : Runs git fat pull via post-merge or pre-rebase githook') print(' Brings in data for orphan objects, computed per HEAD (including history) of your working copy, from \'remote\' to \'share\'.') print(' Creates a sym link: \'objdir/...\' -> \'share/...\' for each orphan object that HEAD points at (no history) ==> No longer orphan.') print(' Lets git invoke git-fat\'s filter-smudge function') print('- git fat pull --all : Same as git fat pull except that the orphan objects are computed across all git objects,') print(' not just per what HEAD (including history) of your working copy.') - print('- git fat push : reference & fat files (not sym links), where & is the intersection operation, is pushed out to:') - print(' - \'remote\'. Diff the same file set between \'objdir\' and \'remote\'. Abort if mismatches.') - print(' - \'share\'. Diff the same file set between \'objdir\' and \'share\'. Abort if mismatches.') + print('- git push : Runs git fat push via pre-push githook') + print(' reference & fat files (not sym links), where & is the intersection operation, is pushed out to:') + print(' \'remote\'. Diff the same file set between \'objdir\' and \'remote\'. Abort if mismatches.') + print(' \'share\'. Diff the same file set between \'objdir\' and \'share\'. Abort if mismatches.') print(' Replaces each such file in \'objdir\' with a sym link, pointing at \'share/...\'.') - print(' Runs git push') print('- git fat push --all : Same steps as git fat push except that reference is computed across all git objects,') print(' not just what your HEAD (including history) is pointing at.') print('') - print('- git checkout ... : git invokes git-fat filter-smudge for each file .gitattributes indicates so.') - print('- git add : git invokes git-fat filter-clean if .gitattributes has a matching line for .') + print('- git checkout ... 
: git invokes git-fat filter-smudge for each file configured in .gitattributes and post-merge githook.') + print('- git add : git invokes git-fat filter-clean for each file configured in .gitattributes.') print('- git commit -a [...] : See git add.') - print('- git merge ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') - print('- git rebase ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') - print('- git cherry-pick ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') - print('- git revert ... : git invokes git-fat filter-clean and filter-smudge for each file .gitattributes indicates so.') + print('- git merge ... : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.') + print('- git rebase ... : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and pre-rebase githook.') + print('- git cherry-pick ... : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.') + print('- git revert ... : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.') if __name__ == '__main__': From 079ca7c25e5c7c289b880fd5dc79fbce83a7ab6e Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Mon, 1 Dec 2014 11:32:32 -0500 Subject: [PATCH 11/18] Fixed bug with file renames that have same blob but different tree objects. Now the objdir is searched for a matching digest and copied to new objfile location. 
--- git-fat | 173 +++++++++++++++++++++++--------------------- test-retroactive.sh | 1 + 2 files changed, 91 insertions(+), 83 deletions(-) diff --git a/git-fat b/git-fat index cad4227..23e3cd9 100755 --- a/git-fat +++ b/git-fat @@ -80,7 +80,6 @@ def cat(instream, outstream): return cat_iter(readblocks(instream), outstream) def difftreez_reader(input): """Incremental reader for git diff-tree -z output - :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ... """ buffer = [] @@ -155,7 +154,7 @@ class GitFat(object): def get_fat_rsync_dirs(self): cfgpath = self.get_fat_config() remote = gitconfig_get('rsync.remote', file=cfgpath) - share = gitconfig_get('rsync.share', file=cfgpath) + share = gitconfig_get('git-fat.share') if share is None: share = gitconfig_get('rsync.local', file=cfgpath) if share is None: @@ -193,24 +192,43 @@ class GitFat(object): dst = share # If share is set up, smudge filter will take care of linking self.objdir to share during merge|rebase step of 'pull', therefore always pull from remote to share here. 
self.verbose('git-fat pushpull_to_rsync: %d file(s) found to pull from %s' % (cnt, remote)) return self.get_rsync_command(src, dst) - def symlink_to_share(self, digest): - 'Create self.objdir/digest (links) pointing at share/digest if the configuration of share is set up appropriately' + def match_digest(self, objpath): + if os.access(objpath, os.R_OK): + return objpath + result = [] + digest = os.path.splitext(objpath)[1] + for root, dirs, files in os.walk(self.objdir): + for name in files: + if fnmatch.fnmatch(name, '*' + digest): + result.append(os.path.join(root, name)) + if not result: + result.append(objpath) + return result[0] + def match_from_objdir(self, objpath): + matchpath = self.match_digest(objpath) + if matchpath != objpath and os.path.exists(matchpath): + mkdir_p(os.path.dirname(objpath)) + shutil.copy(matchpath,objpath) + def symlink_to_share(self, objfile): + 'Create self.objdir/objfile (links) pointing at share/objfile if the configuration of share is set up appropriately' + # Do nothing if share is not set up or points at a non-existing path. (remote, share) = self.get_fat_rsync_dirs() - if share == self.objdir or not os.path.exists(share): # Do nothing if share is not set up or points at a non-existing path. + if share == self.objdir or not os.path.exists(share): return - sharefile = os.path.join(share, digest) - objfile = os.path.join(self.objdir, digest) - if os.path.lexists(objfile): - os.remove(objfile) - mkdir_p(os.path.dirname(objfile)) - os.symlink(sharefile, objfile) # Note that sharefile may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet. 
- def convert_digest_to_symlink(self, files, share): - 'Replace self.objdir/digest in files with links pointing at share/digest' - for digest in files: - fat = os.path.join(self.objdir, digest) - sharefile = os.path.join(share, digest) - os.remove(fat) - os.symlink(sharefile, fat) + objpath = os.path.join(self.objdir, objfile) + if os.path.lexists(objpath): + os.remove(objpath) + sharepath = os.path.join(share, objfile) + # Note that sharepath may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet. + mkdir_p(os.path.dirname(objpath)) + os.symlink(sharepath, objpath) + def convert_objfile_to_symlink(self, files, share): + 'Replace self.objdir/objfile in files with links pointing at share/objfile' + for objfile in files: + objpath = os.path.join(self.objdir, objfile) + sharepath = os.path.join(share, objfile) + os.remove(objpath) + os.symlink(sharepath, objpath) def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode_v1(self, digest, bytes): @@ -288,15 +306,16 @@ class GitFat(object): # Skip empty files if bytes != 0: digest = h.hexdigest() - objfile = os.path.join(self.objdir, fname + '.' + digest) + objfile = fname + '.' + digest + objpath = os.path.join(self.objdir, objfile) if not ishanging: - if os.path.exists(objfile): + if os.path.exists(objpath): os.remove(tmpname) else: # Set permissions for the new file using the current umask os.chmod(tmpname, int('444', 8) & ~umask()) - mkdir_p(os.path.dirname(objfile)) - os.rename(tmpname, objfile) + mkdir_p(os.path.dirname(objpath)) + os.rename(tmpname, objpath) self.verbose('git-fat filter-clean: caching to %s' % objfile) cached = True outstreamclean.write(self.encode(digest, bytes)) @@ -316,16 +335,16 @@ class GitFat(object): self.setup() result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest - result = fname + '.' 
+ result; - objfile = os.path.join(self.objdir, result) - if not os.access(objfile, os.R_OK): - self.verbose('git-fat filter-smudge: fat object missing %s - will query share, if available, and try again' % objfile) - self.symlink_to_share(result) - if not os.access(objfile, os.R_OK): - self.verbose('git-fat filter-smudge: fat object missing %s - will query remote, if available, and try again' % objfile) - self.pull_from_remote(set([result])) + objfile = fname + '.' + result + objpath = os.path.join(self.objdir, objfile) + if not os.access(objpath, os.R_OK): + self.match_from_objdir(objpath) + if not os.access(objpath, os.R_OK): + self.symlink_to_share(objfile) + if not os.access(objpath, os.R_OK): + self.pull_from_remote(set([objfile])) try: - cat(open(objfile), sys.stdout) + cat(open(objpath), sys.stdout) self.verbose('git-fat filter-smudge: restoring from %s' % objfile) except IOError: # file not found self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) @@ -424,18 +443,20 @@ class GitFat(object): def fat_files(self): fatfiles = set() - for fatfile in self.catalog_objects(): - if fatfile != '' and not os.path.islink(os.path.join(self.objdir, fatfile)): - fatfiles.add(fatfile) + for objfile in self.catalog_objects(): + if objfile != '' and not os.path.islink(os.path.join(self.objdir, objfile)): + fatfiles.add(objfile) return fatfiles def cmd_status(self, args): self.setup() catalog = self.catalog_objects() + self.verbose("catalog %s\n" % catalog) refargs = dict() if '--all' in args: refargs['all'] = True referenced = self.referenced_objects(**refargs) + self.verbose("referenced %s\n" % referenced) garbage = catalog - referenced orphans = referenced - catalog if '--all' in args: @@ -474,24 +495,15 @@ class GitFat(object): p.communicate(input='\x00'.join(files)) if p.returncode: sys.exit(p.returncode) - self.convert_digest_to_symlink(files, share) - def git_upstream_exists(self): - args = ['git', 'name-rev', '@{upstream}'] - p = 
subprocess.Popen(args, stdout=subprocess.PIPE) - output = p.communicate()[0].strip() - if p.returncode or not output: - return False - else: - return True + self.convert_objfile_to_symlink(files, share) def cmd_pre_push(self, args): - if self.git_upstream_exists(): - self.cmd_push("") + self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' self.setup() (remote, share) = self.get_fat_rsync_dirs() - if remote is None: - return + if remote is None: + return # Default to push only those objects referenced by current HEAD (includes history) pushall = '--all' in args files = self.referenced_objects(all=pushall) & self.fat_files() @@ -501,14 +513,14 @@ class GitFat(object): 'Update any stale files in the present working tree' self.assert_init_done() for digest, fname in self.orphan_files(): - digest = fname + '.' + digest; - objpath = os.path.join(self.objdir, digest) + objfile = fname + '.' + digest + objpath = os.path.join(self.objdir, objfile) if not os.access(objpath, os.R_OK): - self.symlink_to_share(digest) + self.symlink_to_share(objfile) if not os.access(objpath, os.R_OK): - self.pull_from_remote(set([digest])) + self.pull_from_remote(set([objfile])) if os.access(objpath, os.R_OK): - print('Restoring %s -> %s' % (digest, fname)) + print('Restoring %s -> %s' % (objfile, fname)) # The output of our smudge filter depends on the existence of # the file in .git/fat/objects, but git caches the file stat # from the previous time the file was smudged, therefore it @@ -519,7 +531,7 @@ class GitFat(object): # This re-smudge is essentially a copy that restores permissions. 
subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname]) elif show_orphans: - print('Data unavailable: %s %s' % (digest,fname)) + print('Data unavailable: %s' % objfile) def pull_from_remote(self, files): 'Since this sub is also used by cmd_filter_smudge, stdout needs to be nothing but what git expects => throw away stdout of rsync' if len(files) == 0: @@ -531,17 +543,15 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) def cmd_post_merge(self, args): - if self.git_upstream_exists(): - self.cmd_pull("") + self.cmd_pull("") def cmd_pre_rebase(self, args): - if self.git_upstream_exists(): - self.cmd_pull("") + self.cmd_pull("") def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() (remote, share) = self.get_fat_rsync_dirs() - if remote is None: - return + if remote is None: + return refargs = dict() if '--all' in args: refargs['all'] = True @@ -577,26 +587,26 @@ class GitFat(object): def cmd_gc(self): garbage = self.catalog_objects() - self.referenced_objects() print('Unreferenced objects to remove: %d' % len(garbage)) - for obj in garbage: - fname = os.path.join(self.objdir, obj) - print('%10d %s' % (os.stat(fname).st_size, obj)) - os.remove(fname) + for objfile in garbage: + objpath = os.path.join(self.objdir, objfile) + print('%10d %s' % (os.stat(objpath).st_size, objfile)) + os.remove(objpath) def cmd_verify(self): """Print details of git-fat objects with incorrect data hash""" corrupted_objects = [] - for obj in self.catalog_objects(): - fname = os.path.join(self.objdir, obj) + for objfile in self.catalog_objects(): + objpath = os.path.join(self.objdir, objfile) h = hashlib.new('sha1') - for block in readblocks(open(fname)): + for block in readblocks(open(objpath)): h.update(block) data_hash = h.hexdigest() - if not obj.endswith(data_hash): - corrupted_objects.append((obj, data_hash)) + if not objfile.endswith(data_hash): + corrupted_objects.append((objfile, data_hash)) if 
corrupted_objects: print('Corrupted objects: %d' % len(corrupted_objects)) - for obj, data_hash in corrupted_objects: - print('%s data hash is %s' % (obj, data_hash)) + for objfile, data_hash in corrupted_objects: + print('%s data hash is %s' % (objfile, data_hash)) sys.exit(1) def fat_init_one(self, var, value): @@ -606,32 +616,28 @@ class GitFat(object): return True return False def fat_init_all(self): - ret = False - + ret = False post_merge = os.path.join(self.gitdir, 'hooks', 'post-merge') if not os.path.isfile(post_merge): with open(post_merge, "w") as f: - lines = ["#!/bin/sh -ex\n", "git fat post-merge \"$@\"\n"] + lines = ["#!/bin/sh\n", "git-fat post-merge \"$@\"\n"] f.writelines(lines) os.chmod(post_merge, 0755) ret = True - pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase') if not os.path.isfile(pre_rebase): with open(pre_rebase, "w") as f: - lines = ["#!/bin/sh -ex\n", "git fat pre-rebase \"$@\"\n"] + lines = ["#!/bin/sh\n", "git-fat pre-rebase \"$@\"\n"] f.writelines(lines) os.chmod(pre_rebase, 0755) ret = True - pre_push = os.path.join(self.gitdir, 'hooks', 'pre-push') if not os.path.isfile(pre_push): with open(pre_push, "w") as f: - lines = ["#!/bin/sh -ex\n", "git fat pre-push \"$@\"\n"] + lines = ["#!/bin/sh\n", "git-fat pre-push \"$@\"\n"] f.writelines(lines) os.chmod(pre_push, 0755) ret = True - ret = self.fat_init_one('filter.fat.clean', 'git-fat filter-clean %f') or ret ret = self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret ret = self.fat_init_one('filter.fat.required', 'true') or ret @@ -639,7 +645,7 @@ class GitFat(object): def cmd_init(self): self.setup() if self.fat_init_all() is True: - self.cmd_post_merge("") + #self.cmd_post_merge("") print('Initialized git fat') def gen_large_blobs(self, revs, threshsize): """Build dict of all blobs""" @@ -758,16 +764,17 @@ class GitFat(object): except GitFat.ConfigError: (remote, share) = ('', objdir) pass + print('- remote : Rsync destination containing pushed out 
fat files.') + print(' This rsync destination is where everyone who uses this repo pushes their fat files onto.') + print(' (' + remote + ')') + print('remote is configured via ' + self.get_fat_config()) print('- share : Directory containing pushed out fat files.') print(' To increase performance, you are recommended to set this spot to be on a local NAS on your site.') print(' This directory is shared across all your repos/wcps as well as by your peers if they are using it.') print(' Setting up this directory offers disk space saving as well as allows fine grained push operation => faster push performance.') print(' If this configuration option is not set up, its value defaults to \'objdir\'.') print(' (' + share + ')') - print('- remote : Rsync destination containing pushed out fat files.') - print(' This rsync destination is where everyone who uses this repo pushes their fat files onto.') - print(' (' + remote + ')') - print('share and remote are configured via ' + self.get_fat_config()) + print('share is configured via \'git config git-fat.share\'') print() # Definitions print('Definitions used by git-fat:') diff --git a/test-retroactive.sh b/test-retroactive.sh index eeeeb7a..7a6f359 100755 --- a/test-retroactive.sh +++ b/test-retroactive.sh @@ -44,6 +44,7 @@ git checkout master cat > .gitfat < Date: Mon, 1 Dec 2014 14:37:41 -0500 Subject: [PATCH 12/18] Add git2fat script which takes an existing git repo and converts it into a new git fat repo --- git2fat | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100755 git2fat diff --git a/git2fat b/git2fat new file mode 100755 index 0000000..a509360 --- /dev/null +++ b/git2fat @@ -0,0 +1,110 @@ +#!/bin/bash + +test "$4" || { + echo "Convert an existing git repo to a new git fat repo." + echo "usage: $(basename $0) {}" ; + echo " The should be non-existent or an already created empty repository." 
+ exit 1 ; +} + +SRC_REPO="$(readlink -f $1)" && shift +test "$1" && DEST_REPO="$(readlink -f $1)" && shift +test "$1" && GIT_ATTRIBUTES="$(readlink -f $1)" && shift +test "$1" && GIT_FAT_REMOTE="$1" && shift +test "$1" && GIT_FAT_SHARE="$(readlink -f $1)" && shift + +test "$DEST_REPO" || DEST_REPO="${SRC_REPO}.fat" + +echo; echo "Contents of .gitattributes file" +cat $GIT_ATTRIBUTES + +GIT_FAT="$(mktemp)" +trap "rm -f $GIT_FAT" EXIT SIGTERM +cat << EOF > $GIT_FAT +[rsync] +remote = $GIT_FAT_REMOTE/$(basename $SRC_REPO) +EOF + + +echo ; echo "Contents of .gitfat file" +test "$GIT_FAT" && cat $GIT_FAT + +GIT_ATTRIBUTES_SIZE="$(stat -c %s $GIT_ATTRIBUTES)" +GIT_FAT_SIZE="$(stat -c %s $GIT_FAT)" + +echo; echo "Exporting data from $SRC_REPO" +pushd $SRC_REPO &>/dev/null && { + + # Add in blob for .gitattributes file + cat << EOF > .fast-export +blob +mark :1000000 +data $((GIT_ATTRIBUTES_SIZE + 1)) +EOF + cat $GIT_ATTRIBUTES >> .fast-export + echo >> .fast-export + + # Add in blob for .gitfat file + cat << EOF >> .fast-export +blob +mark :1000001 +data $((GIT_FAT_SIZE + 1)) +EOF + cat $GIT_FAT >> .fast-export + echo >> .fast-export + + # Run fast export splitting output on the first commit + git fast-export --all | csplit - '/^M [0-9]* :[0-9]* /' >/dev/null + + # Add fast-export data before first commit + cat xx00 >> .fast-export + + # Add .gitattribute/.gitfat files to first commit + cat << EOF >> .fast-export +M 100644 :1000000 .gitattributes +M 100644 :1000001 .gitfat +EOF + + # Add the rest of the fast-export data + cat xx01 >> .fast-export + + rm -f xx00 xx01 + popd &>/dev/null +} + +echo ; echo "Importing data into $DEST_REPO" + +# Create dest repo if it does not exist +test -e "$DEST_REPO" || { + mkdir -p $DEST_REPO + git init $DEST_REPO +} + +# Add commits to the destination repository +pushd $DEST_REPO &>/dev/null && { + git fat init + cat $SRC_REPO/.fast-export | git fast-import + + echo ; echo "Re-writing history with git fat enabled for $DEST_REPO" + git 
reset --hard HEAD + git commit -am'Temporary commit of modifications generated from new .gitattributes file' && TEMP_COMMIT="SUCCESS" || TEMP_COMMIT="" + sed 's/ \+filter=fat.*$//' $GIT_ATTRIBUTES | grep -v "^#" > /tmp/fat-filter-files + git filter-branch --index-filter 'git fat index-filter /tmp/fat-filter-files' --tag-name-filter cat -- --all + git rm --cached -rq . + test "$TEMP_COMMIT" && git reset --hard HEAD^ || git reset --hard HEAD + git add . + git commit -am'Modifications generated from new .gitattributes file' + + echo ; echo "Cleaning up $DEST_REPO" + git for-each-ref --format="%(refname)" refs/original/ | xargs -rn 1 git update-ref -d + git reflog expire --expire=now --all + git gc --prune=now + test "$GIT_FAT_SHARE" && git config git-fat.share $GIT_FAT_SHARE + git fat push --all + git fat gc + #git fat checkout + #git reset --hard HEAD + #git checkout . + + popd &>/dev/null +} From fdd6f50786c23c30e351a2026475f7febb5f3313 Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Mon, 1 Dec 2014 15:14:56 -0500 Subject: [PATCH 13/18] Remove debug prints accidentally checked in. --- git-fat | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/git-fat b/git-fat index 23e3cd9..f183140 100755 --- a/git-fat +++ b/git-fat @@ -186,11 +186,11 @@ class GitFat(object): if push: src = self.objdir dst = remote - self.verbose('git-fat pushpull_to_rsync: %d file(s) found to push to %s' % (cnt, remote)) + self.verbose('git-fat : %d file(s) found to push to %s' % (cnt, remote)) else: src = remote dst = share # If share is set up, smudge filter will take care of linking self.objdir to share during merge|rebase step of 'pull', therefore always pull from remote to share here. 
- self.verbose('git-fat pushpull_to_rsync: %d file(s) found to pull from %s' % (cnt, remote)) + self.verbose('git-fat : %d file(s) found to pull from %s' % (cnt, remote)) return self.get_rsync_command(src, dst) def match_digest(self, objpath): if os.access(objpath, os.R_OK): @@ -451,12 +451,10 @@ class GitFat(object): def cmd_status(self, args): self.setup() catalog = self.catalog_objects() - self.verbose("catalog %s\n" % catalog) refargs = dict() if '--all' in args: refargs['all'] = True referenced = self.referenced_objects(**refargs) - self.verbose("referenced %s\n" % referenced) garbage = catalog - referenced orphans = referenced - catalog if '--all' in args: From 7a993cdcca6cf0f5917b2244509f23557bb17361 Mon Sep 17 00:00:00 2001 From: woytowitz Date: Mon, 1 Dec 2014 20:13:08 -0500 Subject: [PATCH 14/18] Added new git fat share command to easily change the share path value --- git-fat | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/git-fat b/git-fat index f183140..3b70726 100755 --- a/git-fat +++ b/git-fat @@ -120,6 +120,13 @@ def gitconfig_set(name, value, file=None): args += [name, value] p = subprocess.check_call(args) +def gitconfig_unset(name, file=None): + args = ['git', 'config', '--unset'] + if file is not None: + args += ['--file', file] + args += [name] + p = subprocess.call(args) + class GitFat(object): DecodeError = RuntimeError ConfigError = RuntimeError @@ -151,12 +158,12 @@ class GitFat(object): self.fat_init_all() # Upgrade old git-fat setup to the latest one def get_fat_config(self): return os.path.join(self.gitroot,'.gitfat') - def get_fat_rsync_dirs(self): + def get_fat_configs(self): cfgpath = self.get_fat_config() remote = gitconfig_get('rsync.remote', file=cfgpath) share = gitconfig_get('git-fat.share') if share is None: - share = gitconfig_get('rsync.local', file=cfgpath) + share = gitconfig_get('share.default', file=cfgpath) if share is None: share = self.objdir return 
remote, share @@ -182,7 +189,7 @@ class GitFat(object): cmd += [src + '/', dst + '/'] return cmd def pushpull_to_rsync(self,push,cnt): - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() if push: src = self.objdir dst = remote @@ -212,7 +219,7 @@ class GitFat(object): def symlink_to_share(self, objfile): 'Create self.objdir/objfile (links) pointing at share/objfile if the configuration of share is set up appropriately' # Do nothing if share is not set up or points at a non-existing path. - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() if share == self.objdir or not os.path.exists(share): return objpath = os.path.join(self.objdir, objfile) @@ -481,7 +488,7 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) def push_to_share(self, files): - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() # Do nothing if share is not set up or points at a non-existing path. if share == self.objdir or not os.path.exists(share): return @@ -493,13 +500,13 @@ class GitFat(object): p.communicate(input='\x00'.join(files)) if p.returncode: sys.exit(p.returncode) - self.convert_objfile_to_symlink(files, share) + self.convert_objfile_to_symlink(self.catalog_objects(), share) def cmd_pre_push(self, args): self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' self.setup() - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() if remote is None: return # Default to push only those objects referenced by current HEAD (includes history) @@ -547,7 +554,7 @@ class GitFat(object): def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() if remote is None: return refargs = dict() @@ -750,7 +757,17 @@ class GitFat(object): updateindex.stdin.close() lsfiles.wait() 
updateindex.wait() - + def cmd_share(self, args): + if len(args): + if args[0] == 'default': + gitconfig_unset('git-fat.share') + else: + gitconfig_set('git-fat.share',args[0]) + (remote, share) = self.get_fat_configs() + self.convert_objfile_to_symlink(self.catalog_objects(),share) + else: + (remote, share) = self.get_fat_configs() + print('%s' % share) def cmd_help(self): objdir = os.path.join(self.gitroot, self.objdir) # Directories @@ -758,21 +775,26 @@ class GitFat(object): print('- objdir : Contains fat objects(files and/or shared links). Shared links will only exist if \'share\' is configured.') print(' (' + objdir + ')') try: - (remote, share) = self.get_fat_rsync_dirs() + (remote, share) = self.get_fat_configs() except GitFat.ConfigError: (remote, share) = ('', objdir) pass + if remote is None: + remote = 'null' + if share is None: + share = 'null' print('- remote : Rsync destination containing pushed out fat files.') print(' This rsync destination is where everyone who uses this repo pushes their fat files onto.') print(' (' + remote + ')') - print('remote is configured via ' + self.get_fat_config()) + print('remote is configured via rsync.remote in ' + self.get_fat_config()) print('- share : Directory containing pushed out fat files.') print(' To increase performance, you are recommended to set this spot to be on a local NAS on your site.') print(' This directory is shared across all your repos/wcps as well as by your peers if they are using it.') print(' Setting up this directory offers disk space saving as well as allows fine grained push operation => faster push performance.') print(' If this configuration option is not set up, its value defaults to \'objdir\'.') print(' (' + share + ')') - print('share is configured via \'git config git-fat.share\'') + print('default share is configured via share.default in ' + self.get_fat_config()) + print('share is configured via \'git fat share {default|}\'') print() # Definitions print('Definitions used by 
git-fat:') @@ -853,6 +875,8 @@ if __name__ == '__main__': fat.cmd_find(sys.argv[2:]) elif cmd == 'index-filter': fat.cmd_index_filter(sys.argv[2:]) + elif cmd == 'share': + fat.cmd_share(sys.argv[2:]) elif cmd == 'help': fat.cmd_help() else: From 55e61e71c630144ce9830479022e9031ee97018b Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Mon, 1 Dec 2014 20:17:07 -0500 Subject: [PATCH 15/18] Update git2fat to use new default share configuration --- git-fat | 3 ++- git2fat | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/git-fat b/git-fat index 3b70726..7f7759e 100755 --- a/git-fat +++ b/git-fat @@ -594,7 +594,7 @@ class GitFat(object): print('Unreferenced objects to remove: %d' % len(garbage)) for objfile in garbage: objpath = os.path.join(self.objdir, objfile) - print('%10d %s' % (os.stat(objpath).st_size, objfile)) + print('%s' % objfile) os.remove(objpath) def cmd_verify(self): @@ -816,6 +816,7 @@ class GitFat(object): print('- git fat checkout : Converts all orphan objects into non-orphan state, while automatically executing \'pull\'-like functionality for the specific orphan file.') print('- git fat gc : Deletes all garbage objects') print('- git fat verify : Report corrupt fat objects in the catalog') + print('- git fat share : Set/Get current share setting') print('- More info? 
: Define export var GIT_FAT_VERBOSE and continue using git-fat.') print('') print('Typical git operations, when is git-fat involved and what it does when it is invoked:') diff --git a/git2fat b/git2fat index a509360..ddab5fd 100755 --- a/git2fat +++ b/git2fat @@ -23,8 +23,16 @@ trap "rm -f $GIT_FAT" EXIT SIGTERM cat << EOF > $GIT_FAT [rsync] remote = $GIT_FAT_REMOTE/$(basename $SRC_REPO) + EOF +test "$GIT_FAT_SHARE" && { +cat << EOF >> $GIT_FAT +[share] +default = $GIT_FAT_SHARE/$(basename $SRC_REPO) + +EOF +} echo ; echo "Contents of .gitfat file" test "$GIT_FAT" && cat $GIT_FAT @@ -99,7 +107,6 @@ pushd $DEST_REPO &>/dev/null && { git for-each-ref --format="%(refname)" refs/original/ | xargs -rn 1 git update-ref -d git reflog expire --expire=now --all git gc --prune=now - test "$GIT_FAT_SHARE" && git config git-fat.share $GIT_FAT_SHARE git fat push --all git fat gc #git fat checkout From 55036d14453311316a23939c8cb6f71d97b58786 Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Wed, 3 Dec 2014 22:19:39 -0500 Subject: [PATCH 16/18] Update the hook scripts if anything is initialized. Put git-fat init into each template hook. Now when git clone/init occurs, git-fat init is called and overwrites the initial hook scripts with the real ones. 
--- git-fat | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/git-fat b/git-fat index 7f7759e..f23fc7a 100755 --- a/git-fat +++ b/git-fat @@ -622,30 +622,33 @@ class GitFat(object): return False def fat_init_all(self): ret = False + ret = self.fat_init_one('filter.fat.clean', 'git-fat filter-clean %f') or ret + ret = self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret + ret = self.fat_init_one('filter.fat.required', 'true') or ret post_merge = os.path.join(self.gitdir, 'hooks', 'post-merge') - if not os.path.isfile(post_merge): + if ret or not os.path.isfile(post_merge): with open(post_merge, "w") as f: lines = ["#!/bin/sh\n", "git-fat post-merge \"$@\"\n"] f.writelines(lines) os.chmod(post_merge, 0755) ret = True pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase') - if not os.path.isfile(pre_rebase): + if ret or not os.path.isfile(pre_rebase): with open(pre_rebase, "w") as f: lines = ["#!/bin/sh\n", "git-fat pre-rebase \"$@\"\n"] f.writelines(lines) os.chmod(pre_rebase, 0755) ret = True pre_push = os.path.join(self.gitdir, 'hooks', 'pre-push') - if not os.path.isfile(pre_push): + if ret or not os.path.isfile(pre_push): with open(pre_push, "w") as f: lines = ["#!/bin/sh\n", "git-fat pre-push \"$@\"\n"] f.writelines(lines) os.chmod(pre_push, 0755) ret = True - ret = self.fat_init_one('filter.fat.clean', 'git-fat filter-clean %f') or ret - ret = self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret - ret = self.fat_init_one('filter.fat.required', 'true') or ret + pre_commit = os.path.join(self.gitdir, 'hooks', 'pre-commit') + if ret and os.path.isfile(pre_commit): + os.remove(pre_commit) return ret def cmd_init(self): self.setup() From 1bee9a65572609ebed67e51eab46b8c6c76abdc9 Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Thu, 4 Dec 2014 19:26:40 -0500 Subject: [PATCH 17/18] Always call cmd_init, so git fat init is no longer needed (but still present for backward compat) --- 
git-fat | 25 ++++++++++--------------- test.sh | 13 ------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/git-fat b/git-fat index f23fc7a..e9172b8 100755 --- a/git-fat +++ b/git-fat @@ -146,8 +146,6 @@ class GitFat(object): return len(enc(hashlib.sha1('dummy').hexdigest(), 5)) self.magiclen = magiclen(self.encode) # Current version self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions - def setup(self): - mkdir_p(self.objdir) def is_init_done(self): return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge') def assert_init_done(self): @@ -335,11 +333,11 @@ class GitFat(object): The clean filter runs when a file is added to the index. It gets the "smudged" (tree) version of the file on stdin and produces the "clean" (repository) version on stdout. ''' - self.setup() + self.cmd_init() self.filter_clean(sys.stdin, sys.stdout, fname) def cmd_filter_smudge(self, fname): - self.setup() + self.cmd_init() result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest objfile = fname + '.' 
+ result @@ -456,7 +454,6 @@ class GitFat(object): return fatfiles def cmd_status(self, args): - self.setup() catalog = self.catalog_objects() refargs = dict() if '--all' in args: @@ -502,10 +499,10 @@ class GitFat(object): sys.exit(p.returncode) self.convert_objfile_to_symlink(self.catalog_objects(), share) def cmd_pre_push(self, args): + self.cmd_init() self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' - self.setup() (remote, share) = self.get_fat_configs() if remote is None: return @@ -548,12 +545,13 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) def cmd_post_merge(self, args): + self.cmd_init() self.cmd_pull("") def cmd_pre_rebase(self, args): + self.cmd_init() self.cmd_pull("") def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' - self.setup() (remote, share) = self.get_fat_configs() if remote is None: return @@ -626,34 +624,30 @@ class GitFat(object): ret = self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret ret = self.fat_init_one('filter.fat.required', 'true') or ret post_merge = os.path.join(self.gitdir, 'hooks', 'post-merge') - if ret or not os.path.isfile(post_merge): + if not os.path.isfile(post_merge): with open(post_merge, "w") as f: lines = ["#!/bin/sh\n", "git-fat post-merge \"$@\"\n"] f.writelines(lines) os.chmod(post_merge, 0755) ret = True pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase') - if ret or not os.path.isfile(pre_rebase): + if not os.path.isfile(pre_rebase): with open(pre_rebase, "w") as f: lines = ["#!/bin/sh\n", "git-fat pre-rebase \"$@\"\n"] f.writelines(lines) os.chmod(pre_rebase, 0755) ret = True pre_push = os.path.join(self.gitdir, 'hooks', 'pre-push') - if ret or not os.path.isfile(pre_push): + if not os.path.isfile(pre_push): with open(pre_push, "w") as f: lines = ["#!/bin/sh\n", "git-fat pre-push \"$@\"\n"] f.writelines(lines) os.chmod(pre_push, 0755) ret = True - pre_commit = 
os.path.join(self.gitdir, 'hooks', 'pre-commit') - if ret and os.path.isfile(pre_commit): - os.remove(pre_commit) return ret def cmd_init(self): - self.setup() + mkdir_p(self.objdir) if self.fat_init_all() is True: - #self.cmd_post_merge("") print('Initialized git fat') def gen_large_blobs(self, revs, threshsize): """Build dict of all blobs""" @@ -850,6 +844,7 @@ class GitFat(object): if __name__ == '__main__': fat = GitFat() + fat.cmd_init() cmd = sys.argv[1] if len(sys.argv) > 1 else '' if cmd == 'filter-clean': fat.cmd_filter_clean(sys.argv[2]) diff --git a/test.sh b/test.sh index 2892802..0277bfc 100755 --- a/test.sh +++ b/test.sh @@ -37,19 +37,6 @@ git fat push cd .. git clone fat-test fat-test2 cd fat-test2 -# checkout and pull should fail in repo not yet init'ed for git-fat -git fat checkout && true -if [ $? -eq 0 ] -then - echo 'ERROR: "git fat checkout" in uninitialised repo should fail' - exit 1 -fi -git fat pull -- 'a.fa*' && true -if [ $? -eq 0 ] -then - echo 'ERROR: "git fat pull" in uninitialised repo should fail' - exit 1 -fi git fat init git fat pull -- 'a.fa*' cat a.fat From da155dc6e43bcb548fb8359a5cba0a6e6474c6f7 Mon Sep 17 00:00:00 2001 From: "Gunter.Woytowitz" Date: Fri, 5 Dec 2014 21:46:14 -0500 Subject: [PATCH 18/18] More updates for git share to allow easier recovery from share changing. 
--- git-fat | 62 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/git-fat b/git-fat index e9172b8..f6a83ef 100755 --- a/git-fat +++ b/git-fat @@ -161,7 +161,7 @@ class GitFat(object): remote = gitconfig_get('rsync.remote', file=cfgpath) share = gitconfig_get('git-fat.share') if share is None: - share = gitconfig_get('share.default', file=cfgpath) + share = gitconfig_get('share.default', file=cfgpath) if share is None: share = self.objdir return remote, share @@ -194,7 +194,10 @@ class GitFat(object): self.verbose('git-fat : %d file(s) found to push to %s' % (cnt, remote)) else: src = remote - dst = share # If share is set up, smudge filter will take care of linking self.objdir to share during merge|rebase step of 'pull', therefore always pull from remote to share here. + if os.path.exists(share): + dst = share + else: + dst = self.objdir self.verbose('git-fat : %d file(s) found to pull from %s' % (cnt, remote)) return self.get_rsync_command(src, dst) def match_digest(self, objpath): @@ -216,9 +219,9 @@ class GitFat(object): shutil.copy(matchpath,objpath) def symlink_to_share(self, objfile): 'Create self.objdir/objfile (links) pointing at share/objfile if the configuration of share is set up appropriately' - # Do nothing if share is not set up or points at a non-existing path. 
+ # Do nothing if share is not set (remote, share) = self.get_fat_configs() - if share == self.objdir or not os.path.exists(share): + if share == self.objdir or not os.path.exists(share): return objpath = os.path.join(self.objdir, objfile) if os.path.lexists(objpath): @@ -232,8 +235,9 @@ class GitFat(object): for objfile in files: objpath = os.path.join(self.objdir, objfile) sharepath = os.path.join(share, objfile) - os.remove(objpath) - os.symlink(sharepath, objpath) + if os.path.exists(sharepath): + os.remove(objpath) + os.symlink(sharepath, objpath) def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode_v1(self, digest, bytes): @@ -333,11 +337,9 @@ class GitFat(object): The clean filter runs when a file is added to the index. It gets the "smudged" (tree) version of the file on stdin and produces the "clean" (repository) version on stdout. ''' - self.cmd_init() self.filter_clean(sys.stdin, sys.stdout, fname) def cmd_filter_smudge(self, fname): - self.cmd_init() result, bytes = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest objfile = fname + '.' + result @@ -486,7 +488,7 @@ class GitFat(object): sys.exit(p.returncode) def push_to_share(self, files): (remote, share) = self.get_fat_configs() - # Do nothing if share is not set up or points at a non-existing path. 
+ # Do nothing if share is not set up if share == self.objdir or not os.path.exists(share): return if len(files) == 0: @@ -499,7 +501,6 @@ class GitFat(object): sys.exit(p.returncode) self.convert_objfile_to_symlink(self.catalog_objects(), share) def cmd_pre_push(self, args): - self.cmd_init() self.cmd_push("") def cmd_push(self, args): 'Push all fat files that I have stored and referenced' @@ -534,6 +535,12 @@ class GitFat(object): subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname]) elif show_orphans: print('Data unavailable: %s' % objfile) + def remove_objdir_broken_symlinks(self, files): + for file in files: + objpath = os.path.join(self.objdir, file) + if os.path.lexists(objpath) and not os.path.exists(objpath): + self.verbose('remove broken symlink %s' % objpath) + os.remove(objpath) def pull_from_remote(self, files): 'Since this sub is also used by cmd_filter_smudge, stdout needs to be nothing but what git expects => throw away stdout of rsync' if len(files) == 0: @@ -545,18 +552,19 @@ class GitFat(object): if p.returncode: sys.exit(p.returncode) def cmd_post_merge(self, args): - self.cmd_init() + self.cmd_pull("") + def cmd_post_checkout(self, args): self.cmd_pull("") def cmd_pre_rebase(self, args): - self.cmd_init() self.cmd_pull("") def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' (remote, share) = self.get_fat_configs() if remote is None: return + self.remove_objdir_broken_symlinks(self.catalog_objects()) refargs = dict() - if '--all' in args: + if not len(args) or '--all' in args: refargs['all'] = True for arg in args: if arg.startswith('-') or len(arg) != 40: @@ -630,6 +638,13 @@ class GitFat(object): f.writelines(lines) os.chmod(post_merge, 0755) ret = True + post_checkout = os.path.join(self.gitdir, 'hooks', 'post-checkout') + if not os.path.isfile(post_checkout): + with open(post_checkout, "w") as f: + lines = ["#!/bin/sh\n", "git-fat post-checkout \"$@\"\n"] + f.writelines(lines) + 
os.chmod(post_checkout, 0755) + ret = True pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase') if not os.path.isfile(pre_rebase): with open(pre_rebase, "w") as f: @@ -760,11 +775,18 @@ class GitFat(object): gitconfig_unset('git-fat.share') else: gitconfig_set('git-fat.share',args[0]) - (remote, share) = self.get_fat_configs() - self.convert_objfile_to_symlink(self.catalog_objects(),share) - else: - (remote, share) = self.get_fat_configs() - print('%s' % share) + + (remote, share) = self.get_fat_configs() + if share != self.objdir: + try: + mkdir_p(share) + except OSError: + print('Share path \'%s\' does not exist.' % share) + return + + (remote, share) = self.get_fat_configs() + print('%s' % share) + def cmd_help(self): objdir = os.path.join(self.gitroot, self.objdir) # Directories @@ -833,7 +855,7 @@ class GitFat(object): print('- git fat push --all : Same steps as git fat push except that reference is computed across all git objects,') print(' not just what your HEAD (including history) is pointing at.') print('') - print('- git checkout ... : git invokes git-fat filter-smudge for each file configured in .gitattributes and post-merge githook.') + print('- git checkout ... : git invokes git-fat filter-smudge for each file configured in .gitattributes and post-checkout githook.') print('- git add : git invokes git-fat filter-clean for each file configured in .gitattributes.') print('- git commit -a [...] : See git add.') print('- git merge ... : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.') @@ -856,6 +878,8 @@ if __name__ == '__main__': fat.cmd_pre_rebase(sys.argv[2:]) elif cmd == 'post-merge': fat.cmd_post_merge(sys.argv[2:]) + elif cmd == 'post-checkout': + fat.cmd_post_checkout(sys.argv[2:]) elif cmd == 'init': fat.cmd_init() elif cmd == 'status':