#!/usr/bin/python
#             __________               __   ___.
#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
#                     \/            \/     \/    \/            \/
#
# Copyright (c) 2012 Dominik Riebeling
#
# All files in this archive are subject to the GNU General Public License.
# See the file COPYING in the source tree root for full license agreement.
#
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
# KIND, either express or implied.
#

'''Scrape files from a git repository.

This module provides functions to get a subset of files from a git repository.
The files to retrieve can be specified, and the git tree to work on can be
specified. That way arbitrary trees can be retrieved (like a subset of files
for a given tag).

Retrieved files can be packaged into a bzip2 compressed tarball or stored in a
given folder for processing afterwards.

Calls git commands directly for maximum compatibility.
'''
30
import re
import subprocess
import os
import tarfile
import tempfile
import shutil
37
38
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.

    Runs "git show-ref --abbrev" in the repository and parses its output,
    which lists one "<abbreviated hash> <ref name>" pair per line.

    @param repo Path to repository root.
    @return Dict mapping each ref name to its (abbreviated) hash. The dict is
            empty if git wrote anything to stderr.
    '''
    print("Getting list of refs")
    output = subprocess.Popen(["git", "show-ref", "--abbrev"],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo)
    stdout, stderr = output.communicate()
    refs = {}

    # Any output on stderr is treated as a failure of the git call.
    if len(stderr) > 0:
        print("An error occured!\n")
        print(stderr.decode("utf-8", "replace"))
        return refs

    # Only stdout carries ref data; decode once and parse line by line.
    for line in stdout.decode().splitlines():
        match = re.match(r'([a-f0-9]+)\s+(\S+)', line)
        if match:
            # ref is the key, hash its value.
            refs[match.group(2)] = match.group(1)

    return refs
62
63
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.

    Runs "git ls-tree -r" and parses its
    "<mode> <type> <hash><TAB><path>" output lines.

    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of path prefixes to retrieve object hashes for.
                      An empty list (or None) will retrieve all paths.
    @return Dict mapping filename to blob hash. The dict is empty if git
            wrote anything to stderr or if a path shows up twice.
    '''
    output = subprocess.Popen(["git", "ls-tree", "-r", start],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo)
    stdout, stderr = output.communicate()
    objects = {}

    # Any output on stderr is treated as a failure of the git call.
    if len(stderr) > 0:
        print("An error occured!\n")
        print(stderr.decode("utf-8", "replace"))
        return objects

    prefixes = tuple(filterlist) if filterlist else ()
    # Decode once and work on text from here on.
    for line in stdout.decode().split('\n'):
        # The path is separated from the metadata by a single TAB, so
        # matching the TAB explicitly keeps leading whitespace in file names.
        match = re.match(r'([0-9]+)\s+([a-z]+)\s+([0-9a-f]+)\t(.*)', line)
        if not match:
            continue
        path = match.group(4)
        if prefixes and not path.startswith(prefixes):
            continue

        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if path in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[path] = match.group(3)
    return objects
100
101
def get_file_timestamp(repo, tree, filename):
    '''Get timestamp for a file.

    Asks "git log" for the author date (ISO-like "%ai" format) of the most
    recent commit reachable from tree that touched filename.

    @param repo Path to repository root.
    @param tree Hash of tree to use.
    @param filename Filename in tree
    @return Timestamp as string. Empty if git printed nothing on stdout;
            anything git writes to stderr is discarded.
    '''
    # "--" separates the revision from the path. Without it git refuses the
    # call ("ambiguous argument") when the file does not exist in the working
    # tree, e.g. in a bare repository or for a file removed since that tree.
    output = subprocess.Popen(
        ["git", "log", "--format=%ai", "-n", "1", tree, "--", filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout = output.communicate()[0]

    return stdout.decode().rstrip()
115
116
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.

    Writes the output of "git cat-file -p" for the object to destfile,
    creating the destination folder first if it does not exist yet.

    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output.
    @return True if file was successfully written, False on error.
    '''
    output = subprocess.Popen(["git", "cat-file", "-p", blob],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo)
    stdout, stderr = output.communicate()
    # Any output on stderr is treated as a failure of the git call.
    if len(stderr) > 0:
        print("An error occured!\n")
        print(stderr.decode("utf-8", "replace"))
        return False

    # make sure output path exists. A bare filename has an empty dirname,
    # which os.makedirs() would reject, so only create real folders.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)
    # Binary mode: the object content is written unmodified.
    with open(destfile, 'wb') as f:
        f.write(stdout)
    return True
138
139
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.

    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description string with trailing whitespace removed, or an empty
            string if git wrote anything to stderr.
    '''
    output = subprocess.Popen(["git", "describe", treehash],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo)
    stdout, stderr = output.communicate()
    # Any output on stderr is treated as a failure of the git call.
    if len(stderr) > 0:
        print("An error occured!\n")
        print(stderr.decode("utf-8", "replace"))
        return ""
    # Decode so the result is text like the other helpers in this module
    # return, instead of raw bytes on Python 3.
    return stdout.decode().rstrip()
154
155
def scrape_files(repo, treehash, filelist, dest="", timestamp_files=None):
    '''Scrape list of files from repository.

    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Files will get retrieved with full
                path from the repository, and the folder structure will get
                created below dest as necessary. If empty, a new temporary
                folder is created and used.
    @param timestamp_files List of path prefixes for which to also get the
                           last modified date. WARNING: this is SLOW!
    @return List of [destination path, filename:timestamp dict].
    '''
    print("Scraping files from repository")

    if dest == "":
        dest = tempfile.mkdtemp()
    prefixes = tuple(timestamp_files) if timestamp_files else ()
    treeobjects = get_lstree(repo, treehash, filelist)
    timestamps = {}
    for obj, blob in treeobjects.items():
        get_object(repo, blob, os.path.join(dest, obj))
        # The timestamp lookup spawns "git log" and is slow, so run it at
        # most once per file even if several prefixes match.
        if prefixes and obj.startswith(prefixes):
            timestamps[obj] = get_file_timestamp(repo, treehash, obj)

    return [dest, timestamps]
Dominik Riebeling909b96f2012-02-05 19:17:53 +0100181
182
def archive_files(repo, treehash, filelist, basename, tmpfolder="",
                  archive="tbz"):
    '''Archive list of files into tarball.

    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used.
    @param archive Type of archive to create. Supported values are "tbz" and
                   "7z". The latter requires the 7z binary available in the
                   system's path.
    @return Output filename. Empty string if basename is empty or the archive
            type is not supported.
    '''
    # Without a basename there is nothing to name the archive after; bail out
    # before creating temporary folders or scraping any files.
    if basename == "":
        return ""

    if tmpfolder == "":
        temp_remove = True
        tmpfolder = tempfile.mkdtemp()
    else:
        temp_remove = False
    workfolder = os.path.join(tmpfolder, basename)
    outfile = ""
    try:
        workfolder = scrape_files(repo, treehash, filelist, workfolder)[0]
        print("Archiving files from repository")
        if archive == "7z":
            outfile = basename + ".7z"
            # 7z runs inside tmpfolder so the archive only contains the
            # basename folder; the archive itself goes to the current folder.
            output = subprocess.Popen(
                ["7z", "a", os.path.join(os.getcwd(), basename + ".7z"),
                 basename],
                cwd=tmpfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            output.communicate()
        elif archive == "tbz":
            outfile = basename + ".tar.bz2"
            tf = tarfile.open(outfile, "w:bz2")
            try:
                tf.add(workfolder, basename)
            finally:
                tf.close()
        else:
            print("Files not archived")
    finally:
        # Remove intermediate files even if scraping or archiving failed.
        if tmpfolder != workfolder and os.path.exists(workfolder):
            shutil.rmtree(workfolder)
        if temp_remove:
            shutil.rmtree(tmpfolder)
    return outfile