]> Tony Duckles's Git Repositories (git.nynim.org) - svn2svn.git/blob - svn2svn/svnclient.py
Fix copy-from handling and parent-path checking in get_kind()
[svn2svn.git] / svn2svn / svnclient.py
1 """ SVN client functions """
2
3 from shell import run_svn
4 from errors import EmptySVNLog
5
6 import os
7 import time
8 import calendar
9 import operator
10
11 try:
12 from xml.etree import cElementTree as ET
13 except ImportError:
14 try:
15 from xml.etree import ElementTree as ET
16 except ImportError:
17 try:
18 import cElementTree as ET
19 except ImportError:
20 from elementtree import ElementTree as ET
21
# Identity translation table; retained for backward compatibility with any
# code that still refers to it.
_identity_table = "".join(map(chr, range(256)))
# Every control character below 0x20 except TAB (0x09), LF (0x0A), CR (0x0D).
_forbidden_xml_chars = "".join(
    set(map(chr, range(32))) - set('\x09\x0A\x0D')
)
# The same forbidden set, in the two shapes translate() needs:
#  - a byte string of characters to delete (byte-string input)
#  - a {code point: None} mapping (text input)
_forbidden_xml_bytes = bytes(bytearray(map(ord, _forbidden_xml_chars)))
_forbidden_xml_ordinals = dict.fromkeys(map(ord, _forbidden_xml_chars))


def strip_forbidden_xml_chars(xml_string):
    """
    Given an XML string, strips forbidden characters as per the XML spec.
    (these are all control characters except 0x9, 0xA and 0xD).

    Accepts either a byte string or a text (unicode) string and returns a
    value of the same type with the forbidden characters removed.
    """
    if isinstance(xml_string, bytes):
        # Byte strings support translate(table, deletechars); a table of
        # None means "delete only, do not remap anything".
        return xml_string.translate(None, _forbidden_xml_bytes)
    # Text strings only support the one-argument form, where mapping a code
    # point to None deletes it.
    return xml_string.translate(_forbidden_xml_ordinals)
34
35
def svn_date_to_timestamp(svn_date):
    """
    Parse an SVN date as read from the XML output and return the corresponding
    timestamp (seconds since the epoch, interpreted as UTC).

    The fractional-seconds part and a trailing "Z" are discarded, so both
    "2011-08-23T05:12:34.123456Z" and "2011-08-23T05:12:34Z" are accepted.
    Raises ValueError if the remainder is not "YYYY-MM-DDTHH:MM:SS".
    """
    # Strip microseconds and timezone (always UTC, hopefully)
    # XXX there are various ISO datetime parsing routines out there,
    # cf. http://seehuhn.de/comp/pdate
    # The rstrip() covers dates that carry no fractional part, where the
    # split alone would leave the "Z" suffix behind and break strptime().
    date = svn_date.split('.', 1)[0].rstrip('Z')
    time_tuple = time.strptime(date, "%Y-%m-%dT%H:%M:%S")
    # timegm() treats the tuple as UTC (unlike time.mktime()).
    return calendar.timegm(time_tuple)
47
def parse_svn_info_xml(xml_string):
    """
    Turn the XML emitted by "svn info --xml" into a plain dict.

    Keys produced: 'url', 'kind', 'revision', 'repos_url', 'repos_uuid',
    'last_changed_rev', 'last_changed_date' and, only when the commit has an
    <author> element, 'last_changed_author'.
    """
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    entry_node = root.find('.//entry')
    result = {
        'url': entry_node.find('url').text,
        'kind': entry_node.get('kind'),
        'revision': int(entry_node.get('revision')),
        'repos_url': root.find('.//repository/root').text,
        'repos_uuid': root.find('.//repository/uuid').text,
        'last_changed_rev': int(root.find('.//commit').get('revision')),
    }
    # The author element is optional; the key is simply left out without it.
    author_node = root.find('.//commit/author')
    if author_node is not None:
        result['last_changed_author'] = author_node.text
    commit_date = root.find('.//commit/date').text
    result['last_changed_date'] = svn_date_to_timestamp(commit_date)
    return result
68
def _get_kind(svn_repos_url, svn_path, svn_rev, action, paths):
    """
    Calculate the "kind"-type of a given URL in the SVN repo.

    svn_repos_url -- repository root URL; svn_path is appended to it.
    svn_path      -- repository-relative path of the changed item.
    svn_rev       -- revision number of the log entry being processed.
    action        -- the log action letter for svn_path; 'D' gets special
                     handling because the path no longer exists at svn_rev.
    paths         -- list of path dicts (keys 'path', 'kind',
                     'copyfrom_path', 'copyfrom_revision') from the same
                     log entry, searched for copied parent directories.

    Returns the 'kind' value reported by get_svn_info() for the resolved
    location.
    """
    # By default, just do a simple "svn info" based on passed-in params.
    info_path = svn_path
    info_rev = svn_rev
    if action == 'D':
        # For deletions, we can't do an "svn info" at this revision.
        # Need to trace ancestry backwards.
        # Collect any copy-from'd directories in this log entry that we're
        # a child of.
        copied_parents = [p for p in paths
                          if p['kind'] == 'dir'
                          and p['copyfrom_revision']
                          and svn_path.startswith(p['path'] + "/")]
        if copied_parents:
            # Use the nearest copy-from'd parent. Every candidate is a
            # prefix of svn_path, so the longest one is the nearest.
            nearest = max(copied_parents, key=lambda p: len(p['path']))
            # Swap only the leading parent prefix for its copy-from source;
            # a blanket str.replace() would also rewrite later occurrences
            # of the same text deeper in the path.
            info_path = nearest['copyfrom_path'] + svn_path[len(nearest['path']):]
            info_rev = nearest['copyfrom_revision']
        else:
            # If no parent copy-from's, then we should be able to check this
            # path in the preceding revision.
            info_rev -= 1
    info = get_svn_info(svn_repos_url + info_path, info_rev)
    return info['kind']
98
def parse_svn_log_xml(xml_string, svn_url_or_wc):
    """
    Parse the XML output from an "svn log" command and extract useful information
    as a list of dicts (one per log changeset).

    Each dict has the keys 'revision', 'author', 'date', 'message',
    'changed_paths' (list of dicts with 'path', 'kind', 'action',
    'copyfrom_path', 'copyfrom_revision', sorted by path) and 'revprops'
    (list of {'name', 'value'} dicts).

    svn_url_or_wc is passed to get_svn_info() once, on the first log entry,
    to learn the repository root URL used for "kind" lookups.
    """
    l = []
    info = {}
    svn_repos_url = ""
    xml_string = strip_forbidden_xml_chars(xml_string)
    tree = ET.fromstring(xml_string)
    for entry in tree.findall('logentry'):
        d = {}
        d['revision'] = int(entry.get('revision'))
        if not info:
            # Fetched once and reused for every following entry.
            info = get_svn_info(svn_url_or_wc, d['revision'])
            svn_repos_url = info['repos_url']
        # Some revisions don't have authors, most notably the first revision
        # in a repository.
        # logentry nodes targeting directories protected by path-based
        # authentication have no child nodes at all. We return an entry
        # in that case. Anyway, as it has no path entries, no further
        # processing will be made.
        author = entry.find('author')
        date = entry.find('date')
        msg = entry.find('msg')
        if author is not None and author.text:
            d['author'] = author.text
        else:
            d['author'] = "No author"
        if date is not None:
            d['date'] = svn_date_to_timestamp(date.text)
        else:
            d['date'] = None
        if msg is not None and msg.text:
            # Normalise every line-ending flavour to a bare "\n".
            d['message'] = msg.text.replace('\r\n', '\n').replace('\n\r', '\n').replace('\r', '\n')
        else:
            d['message'] = ""
        paths = []
        for path in entry.findall('.//paths/path'):
            copyfrom_rev = path.get('copyfrom-rev')
            if copyfrom_rev:
                copyfrom_rev = int(copyfrom_rev)
            cur_path = path.text
            kind = path.get('kind')
            action = path.get('action')
            if not kind:
                # The kind may be reported as an empty string or the
                # attribute may be missing entirely (get() returns None);
                # either way, work it out with an explicit lookup. Only the
                # paths collected so far in this entry are visible to it.
                kind = _get_kind(svn_repos_url, cur_path, d['revision'], action, paths)
            assert (kind == 'file') or (kind == 'dir')
            paths.append({
                'path': cur_path,
                'kind': kind,
                'action': action,
                'copyfrom_path': path.get('copyfrom-path'),
                'copyfrom_revision': copyfrom_rev,
            })
        # Sort paths (i.e. into hierarchical order), so that process_svn_log_entry()
        # can process actions in depth-first order.
        d['changed_paths'] = sorted(paths, key=operator.itemgetter('path'))
        revprops = []
        for prop in entry.findall('.//revprops/property'):
            revprops.append({'name': prop.get('name'), 'value': prop.text})
        d['revprops'] = revprops
        l.append(d)
    return l
157
def parse_svn_status_xml(xml_string, base_dir=None, ignore_externals=False):
    """
    Turn the XML emitted by "svn status --xml" into a list of dicts, one per
    status entry.

    Each dict has 'path', 'type' ('external', 'normal' or 'unversioned'),
    'status', 'revision', 'props' and 'copied'. When base_dir is given, every
    entry path must start with it (asserted) and is returned relative to it.
    Entries whose status item is 'external' are dropped when ignore_externals
    is true.
    """
    if base_dir:
        base_dir = os.path.normcase(base_dir)
    results = []
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    for node in root.findall('.//entry'):
        rel_path = node.get('path')
        if base_dir is not None:
            assert os.path.normcase(rel_path).startswith(base_dir)
            # Drop the base directory and any separator left at the front.
            rel_path = rel_path[len(base_dir):].lstrip('/\\')
        wc_node = node.find('wc-status')
        item = wc_node.get('item')
        if item == 'external' and ignore_externals:
            continue
        rev = wc_node.get('revision')
        if item == 'external':
            entry_type = 'external'
        elif rev is None:
            # No revision attribute on the wc-status element.
            entry_type = 'unversioned'
        else:
            entry_type = 'normal'
        results.append({
            'path': rel_path,
            'type': entry_type,
            'status': item,
            'revision': rev,
            'props': wc_node.get('props'),
            'copied': wc_node.get('copied'),
        })
    return results
193
def get_svn_rev(svn_url_or_wc, rev_number):
    """
    Resolve a revision specifier to a concrete revision number by running
    "svn info -r <rev_number>" on the target and reading back the entry's
    revision.
    """
    cmd = ['info', '--xml', '-r', rev_number, svn_url_or_wc]
    parsed = parse_svn_info_xml(run_svn(cmd, fail_if_stderr=True))
    return parsed['revision']
201
def get_svn_info(svn_url_or_wc, rev_number=None):
    """
    Run "svn info --xml" on a URL or working copy and return the dict built
    by parse_svn_info_xml().

    When rev_number is given it is used both as the operative revision (-r)
    and as the peg revision (target@rev).
    """
    if rev_number is None:
        target_args = [svn_url_or_wc]
    else:
        pegged_target = svn_url_or_wc + "@" + str(rev_number)
        target_args = ["-r", rev_number, pegged_target]
    output = run_svn(['info', '--xml'] + target_args, fail_if_stderr=True)
    return parse_svn_info_xml(output)
215
def svn_checkout(svn_url, checkout_dir, rev_number=None):
    """
    Quietly check out svn_url into checkout_dir, pinned to rev_number when
    one is supplied. Returns whatever run_svn() returns.
    """
    cmd = ['checkout', '-q']
    if rev_number is not None:
        cmd.extend(['-r', rev_number])
    cmd.extend([svn_url, checkout_dir])
    return run_svn(cmd)
225
def run_svn_log(svn_url_or_wc, rev_start, rev_end, limit, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
    """
    Run "svn log --xml" for the revision range rev_start:rev_end, capped at
    'limit' entries, and return the entries parsed by parse_svn_log_xml().

    If the target contains no "@", a peg revision equal to
    max(rev_start, rev_end) is appended to it.
    """
    cmd = ['log', '--xml']
    optional_flags = (
        (stop_on_copy, '--stop-on-copy'),
        (get_changed_paths, '-v'),
        (get_revprops, '--with-all-revprops'),
    )
    for enabled, flag in optional_flags:
        if enabled:
            cmd.append(flag)
    target = str(svn_url_or_wc)
    cmd.extend(['-r', '%s:%s' % (rev_start, rev_end)])
    if "@" not in svn_url_or_wc:
        # No "@" anywhere in the target: peg it at the higher bound.
        # NOTE(review): a URL with user@host is also treated as already pegged.
        target = "%s@%s" % (svn_url_or_wc, str(max(rev_start, rev_end)))
    cmd.extend(['--limit', str(limit), target])
    output = run_svn(cmd)
    return parse_svn_log_xml(output, svn_url_or_wc)
244
def get_svn_status(svn_wc, quiet=False, no_recursive=False):
    """
    Run "svn status --xml --ignore-externals" on a working copy and return
    the entries parsed by parse_svn_status_xml(), with paths relative to the
    working copy root.

    quiet selects -q instead of -v; no_recursive adds -N.
    """
    # An absolute path lets the parser strip the prefix reliably.
    wc_root = os.path.abspath(svn_wc)
    cmd = ['status', '--xml', '--ignore-externals']
    if quiet:
        cmd.append('-q')
    else:
        cmd.append('-v')
    if no_recursive:
        cmd.append('-N')
    cmd.append(wc_root)
    output = run_svn(cmd)
    return parse_svn_status_xml(output, wc_root, ignore_externals=True)
260
def get_svn_versioned_files(svn_wc):
    """
    Return the paths of all status entries of type 'normal' in the working
    copy, skipping entries with an empty path.
    """
    return [item['path']
            for item in get_svn_status(svn_wc)
            if item['path'] and item['type'] == 'normal']
270
def get_one_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
    """
    Return the single log entry that "svn log --limit 1" yields for the
    range rev_start:rev_end.

    Raises EmptySVNLog when the range produces no entries.
    """
    found = run_svn_log(svn_url, rev_start, rev_end, 1,
                        stop_on_copy, get_changed_paths, get_revprops)
    if not found:
        raise EmptySVNLog("No SVN log for %s between revisions %s and %s" %
                          (svn_url, rev_start, rev_end))
    return found[0]
280
def get_first_svn_log_entry(svn_url, rev_start, rev_end, get_changed_paths=True):
    """
    Get the first log entry after (or at) rev_start, searching forward up to
    rev_end, in an SVN branch. The search stops at copies (--stop-on-copy),
    so starting from a low revision gives the log entry corresponding to the
    branch creation.

    get_changed_paths controls whether the entry's 'changed_paths' are
    fetched.

    NOTE: to know whether the branch creation corresponds to an SVN import or
    a copy from another branch, inspect elements of the 'changed_paths' entry
    in the returned dictionary.

    Raises EmptySVNLog (via get_one_svn_log_entry) if the range has no entries.
    """
    # Honour the caller's get_changed_paths rather than forcing it to True.
    return get_one_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=True,
                                 get_changed_paths=get_changed_paths)
292
def get_last_svn_log_entry(svn_url, rev_start, rev_end, get_changed_paths=True):
    """
    Get the last log entry before/at rev_end, searching backward down to
    rev_start, in an SVN branch. The search stops at copies (--stop-on-copy).
    Passing HEAD as rev_end gives the log entry corresponding to the latest
    commit in the branch.

    get_changed_paths controls whether the entry's 'changed_paths' are
    fetched.

    Raises EmptySVNLog (via get_one_svn_log_entry) if the range has no entries.
    """
    # The bounds are swapped on purpose: asking for rev_end:rev_start makes
    # the log run newest-first, so the one entry fetched is the last one.
    return get_one_svn_log_entry(svn_url, rev_end, rev_start, stop_on_copy=True,
                                 get_changed_paths=get_changed_paths)
300
301
# Tunables for the chunked fetching done by iter_svn_log_entries().
# Request duration (seconds) below which the chunk length is doubled; it is
# halved once a request takes more than twice this long.
log_duration_threshold = 10.0
# Lower bound on the chunk length, which is also its starting value.
log_min_chunk_length = 10
# Upper bound on the chunk length.
log_max_chunk_length = 10000
305
def iter_svn_log_entries(svn_url, first_rev, last_rev, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
    """
    Iterate over SVN log entries between first_rev and last_rev.

    first_rev and last_rev may be integers or numeric strings; last_rev may
    also be "HEAD", which is resolved through get_svn_info(). Any other
    non-numeric value raises ValueError.

    This function features chunked log fetching so that it isn't too nasty
    to the SVN server if many entries are requested.

    NOTE: This chunked log fetching *ONLY* works correctly on paths which
    are known to have existed unbroken in the SVN repository, e.g. /trunk.
    Chunked fetching breaks down if a path existed earlier, then was
    deleted, and later was re-created. For example, if path was created in r5,
    then deleted in r1000, and then later re-created in r5000...
      svn log --stop-on-copy --limit 1 -r 1:50 "path/to/file"
        --> would yield r5, i.e. the _initial_ creation
      svn log --stop-on-copy --limit 1 -r 1:HEAD "path/to/file"
        --> would yield r5000, i.e. the _re-creation_
    In theory this might work if we always search "backwards", searching from
    the end going forward rather than forward going to the end...
    """
    if last_rev == "HEAD":
        info = get_svn_info(svn_url)
        last_rev = info['revision']
    # Work with integers throughout: the loop bound, min() and the "+"
    # below are only meaningful on numbers, and revisions handed over as
    # strings would otherwise be compared or added as text.
    last_rev = int(last_rev)
    cur_rev = int(first_rev)
    chunk_length = log_min_chunk_length
    while cur_rev <= last_rev:
        start_t = time.time()
        stop_rev = min(last_rev, cur_rev + chunk_length)
        entries = run_svn_log(svn_url, cur_rev, stop_rev, chunk_length,
                              stop_on_copy, get_changed_paths, get_revprops)
        duration = time.time() - start_t
        if entries:
            for e in entries:
                if e['revision'] > last_rev:
                    break
                yield e
            if e['revision'] >= last_rev:
                break
            # Resume right after the last entry seen in this chunk.
            cur_rev = e['revision'] + 1
        else:
            # Nothing in this window; move on to the next one.
            cur_rev = stop_rev + 1
        # Adapt chunk length based on measured request duration
        if duration < log_duration_threshold:
            chunk_length = min(log_max_chunk_length, int(chunk_length * 2.0))
        elif duration > log_duration_threshold * 2:
            chunk_length = max(log_min_chunk_length, int(chunk_length / 2.0))
351
352
# Cached result of get_svn_client_version(); None until the first call.
_svn_client_version = None


def get_svn_client_version():
    """
    Return the SVN client version as a tuple of ints.

    The output of "svn --version -q" is split on dots; components that are
    not purely digits are silently dropped. The result is computed once and
    cached at module level.
    """
    global _svn_client_version
    if _svn_client_version is not None:
        return _svn_client_version
    version_text = run_svn(['--version', '-q']).strip()
    numeric_parts = [int(part) for part in version_text.split('.')
                     if part.isdigit()]
    _svn_client_version = tuple(numeric_parts)
    return _svn_client_version