Tony Duckles's Git Repositories (git.nynim.org) - svn2svn.git/blob - svn2svn/svnclient.py
More fixes to iter_svn_log_entries ancestry-handling
[svn2svn.git] / svn2svn / svnclient.py
1 """ SVN client functions """
2
3 from shell import run_svn
4 from errors import EmptySVNLog
5
6 import os
7 import time
8 import calendar
9 import operator
10
11 try:
12 from xml.etree import cElementTree as ET
13 except ImportError:
14 try:
15 from xml.etree import ElementTree as ET
16 except ImportError:
17 try:
18 import cElementTree as ET
19 except ImportError:
20 from elementtree import ElementTree as ET
21
# Identity table for the two-argument byte-string form of translate().
_identity_table = "".join(map(chr, range(256)))
# Every control character below 0x20 except TAB (0x09), LF (0x0A) and CR (0x0D).
_forbidden_xml_chars = "".join(
    set(map(chr, range(32))) - set('\x09\x0A\x0D')
)
# Same character set as an {ordinal: None} mapping, which is the form that
# text (unicode) strings expect for their one-argument translate().
_forbidden_xml_ordinals = dict.fromkeys(map(ord, _forbidden_xml_chars))


def strip_forbidden_xml_chars(xml_string):
    """
    Given an XML string, strips forbidden characters as per the XML spec.
    (these are all control characters except 0x9, 0xA and 0xD).

    Works for byte strings (two-argument translate) as well as for text
    strings, whose translate() only accepts a single mapping argument.
    Returns a new string of the same type as the input.
    """
    try:
        return xml_string.translate(_identity_table, _forbidden_xml_chars)
    except TypeError:
        # Text strings reject the (table, deletechars) call signature;
        # fall back to the mapping form that deletes the same characters.
        return xml_string.translate(_forbidden_xml_ordinals)
34
35
def svn_date_to_timestamp(svn_date):
    """
    Parse an SVN date as read from the XML output and return the corresponding
    timestamp (seconds since the epoch, interpreting the date as UTC).

    The fractional-seconds part and everything after it are discarded. A date
    that has no fractional part but still ends in the "Z" zone designator
    (e.g. "2011-12-28T04:11:08Z") is accepted as well.

    Raises ValueError if the remaining text is not "YYYY-MM-DDTHH:MM:SS".
    """
    # Strip microseconds and timezone (always UTC, hopefully)
    # XXX there are various ISO datetime parsing routines out there,
    # cf. http://seehuhn.de/comp/pdate
    date = svn_date.split('.', 1)[0]
    if date.endswith('Z'):
        # No fractional seconds were present, so the zone marker survived
        # the split above; drop it so that strptime() accepts the value.
        date = date[:-1]
    time_tuple = time.strptime(date, "%Y-%m-%dT%H:%M:%S")
    return calendar.timegm(time_tuple)
47
def parse_svn_info_xml(xml_string):
    """
    Turn the XML produced by "svn info --xml" into a dict.

    Keys: 'url', 'kind', 'revision', 'repos_url', 'repos_uuid',
    'last_changed_rev', 'last_changed_date' (a timestamp) and, only when
    the commit carries an author element, 'last_changed_author'.
    """
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    entry_node = root.find('.//entry')
    info = {
        'url': entry_node.find('url').text,
        'kind': entry_node.get('kind'),
        'revision': int(entry_node.get('revision')),
        'repos_url': root.find('.//repository/root').text,
        'repos_uuid': root.find('.//repository/uuid').text,
        'last_changed_rev': int(root.find('.//commit').get('revision')),
    }
    # The author element is optional, so its key is only set when present.
    author_node = root.find('.//commit/author')
    if author_node is not None:
        info['last_changed_author'] = author_node.text
    commit_date = root.find('.//commit/date').text
    info['last_changed_date'] = svn_date_to_timestamp(commit_date)
    return info
68
def get_kind(svn_repos_url, svn_path, svn_rev, action, paths):
    """
    Calculate the "kind"-type of a given URL in the SVN repo.

    svn_repos_url -- repository root URL; svn_path is appended to it.
    svn_path      -- path of the item inside the repository.
    svn_rev       -- revision in which the action happened.
    action        -- log action code; 'D' (deletion) gets special handling.
    paths         -- changed-path dicts of the same log entry, each with the
                     keys 'path', 'kind', 'copyfrom_path', 'copyfrom_revision'.

    Returns the 'kind' value reported by get_svn_info() for the resolved
    location.
    """
    # By default, just do a simple "svn info" based on passed-in params.
    info_path = svn_path
    info_rev = svn_rev
    if action == 'D':
        # For deletions, we can't do an "svn info" at this revision.
        # Need to trace ancestry backwards.
        nearest = None
        for p in paths:
            # Consider any copy-from'd directory in this log entry that we're
            # a child of, and keep the nearest one. All candidates are
            # prefixes of svn_path, so the longest path is the nearest parent.
            if p['kind'] == 'dir' and p['copyfrom_revision'] and svn_path.startswith(p['path']+"/"):
                if nearest is None or len(p['path']) > len(nearest['path']):
                    nearest = p
        if nearest is not None:
            # Swap only the leading parent prefix for its copy-from source;
            # a plain str.replace() would also rewrite any later occurrence
            # of the same text further down the path.
            info_path = nearest['copyfrom_path'] + svn_path[len(nearest['path']):]
            info_rev = nearest['copyfrom_revision']
        else:
            # If no parent copy-from's, then we should be able to check this
            # path in the preceding revision.
            info_rev -= 1
    info = get_svn_info(svn_repos_url+info_path, info_rev)
    return info['kind']
98
def parse_svn_log_xml(xml_string):
    """
    Turn the XML produced by "svn log --xml" into a list of dicts, one per
    logentry element.

    Each dict has the keys 'revision', 'author', 'date_raw', 'date',
    'message', 'changed_paths' (sorted by path) and 'revprops'.
    """
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    log_entries = []
    for entry_node in root.findall('logentry'):
        record = {'revision': int(entry_node.get('revision'))}

        # Some revisions don't have authors, most notably the first revision
        # in a repository. Log entries for directories protected by
        # path-based authentication have no child nodes at all; an entry is
        # still returned for them, just without any path information.
        author_node = entry_node.find('author')
        date_node = entry_node.find('date')
        msg_node = entry_node.find('msg')

        if author_node is not None and author_node.text:
            record['author'] = author_node.text
        else:
            record['author'] = "No author"

        if date_node is None:
            record['date_raw'] = None
            record['date'] = None
        else:
            record['date_raw'] = date_node.text
            record['date'] = svn_date_to_timestamp(date_node.text)

        if msg_node is not None and msg_node.text:
            # Normalize every line-ending flavour to a plain "\n".
            text = msg_node.text
            text = text.replace('\r\n', '\n')
            text = text.replace('\n\r', '\n')
            record['message'] = text.replace('\r', '\n')
        else:
            record['message'] = ""

        changed = []
        for path_node in entry_node.findall('.//paths/path'):
            copyfrom_rev = path_node.get('copyfrom-rev')
            if copyfrom_rev:
                copyfrom_rev = int(copyfrom_rev)
            changed.append({
                'path': path_node.text,
                'kind': path_node.get('kind'),
                'action': path_node.get('action'),
                'copyfrom_path': path_node.get('copyfrom-path'),
                'copyfrom_revision': copyfrom_rev,
            })
        # Sort paths (i.e. into hierarchical order), so that
        # process_svn_log_entry() can process actions in depth-first order.
        changed.sort(key=operator.itemgetter('path'))
        record['changed_paths'] = changed

        record['revprops'] = [
            {'name': prop_node.get('name'), 'value': prop_node.text}
            for prop_node in entry_node.findall('.//revprops/property')
        ]
        log_entries.append(record)
    return log_entries
144
def parse_svn_status_xml(xml_string, base_dir=None, ignore_externals=False):
    """
    Turn the XML produced by "svn status --xml" into a list of dicts, one per
    status entry.

    When base_dir is given, it is stripped from the front of every entry path
    that starts with it (compared after os.path.normcase). Entries whose
    status item is 'external' are dropped when ignore_externals is true.
    Each dict has the keys 'path', 'type' ('external', 'normal' or
    'unversioned'), 'status', 'revision', 'props' and 'copied'.
    """
    if base_dir:
        base_dir = os.path.normcase(base_dir)
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    results = []
    for entry_node in root.findall('.//entry'):
        rel_path = entry_node.get('path')
        if base_dir is not None and os.path.normcase(rel_path).startswith(base_dir):
            # Make the path relative to base_dir, without a leading separator.
            rel_path = rel_path[len(base_dir):].lstrip('/\\')
        status_node = entry_node.find('wc-status')
        item = status_node.get('item')
        if item == 'external' and ignore_externals:
            continue
        revision = status_node.get('revision')
        if item == 'external':
            entry_type = 'external'
        elif revision is None:
            # No revision attribute: the item is not under version control.
            entry_type = 'unversioned'
        else:
            entry_type = 'normal'
        results.append({
            'path': rel_path,
            'type': entry_type,
            'status': item,
            'revision': revision,
            'props': status_node.get('props'),
            'copied': status_node.get('copied'),
        })
    return results
179
def get_svn_rev(svn_url_or_wc, rev_number):
    """
    Resolve an SVN revision pattern to a discrete revision number by running
    "svn info -r <rev_number>" and returning the 'revision' it reports.
    """
    cmd = ['info', '--xml', '-r', rev_number, svn_url_or_wc]
    parsed = parse_svn_info_xml(run_svn(cmd, fail_if_stderr=True))
    return parsed['revision']
187
def get_svn_info(svn_url_or_wc, rev_number=None):
    """
    Run "svn info --xml" on a URL or working copy and return the result as a
    dict (see parse_svn_info_xml()).

    When rev_number is given it is used both as the operative revision (-r)
    and as the peg revision ("@rev" suffix on the target).
    """
    if rev_number is None:
        target_args = [svn_url_or_wc]
    else:
        pegged_target = svn_url_or_wc+"@"+str(rev_number)
        target_args = ["-r", rev_number, pegged_target]
    xml_output = run_svn(['info', '--xml'] + target_args, fail_if_stderr=True)
    return parse_svn_info_xml(xml_output)
201
def svn_checkout(svn_url, checkout_dir, rev_number=None):
    """
    Quietly check out svn_url into checkout_dir, at rev_number when one is
    given. Returns whatever run_svn() returns.
    """
    if rev_number is None:
        rev_args = []
    else:
        rev_args = ['-r', rev_number]
    return run_svn(['checkout', '-q'] + rev_args + [svn_url, checkout_dir])
211
def run_svn_log(svn_url_or_wc, rev_start, rev_end, limit, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
    """
    Run "svn log --xml" for the range rev_start:rev_end, returning at most
    'limit' entries as parsed by parse_svn_log_xml().

    If the target carries no "@" peg revision, the larger of the two range
    bounds is appended as the peg revision.
    """
    cmd = ['log', '--xml']
    optional_flags = (
        (stop_on_copy, '--stop-on-copy'),
        (get_changed_paths, '-v'),
        (get_revprops, '--with-all-revprops'),
    )
    for enabled, flag in optional_flags:
        if enabled:
            cmd.append(flag)
    target = str(svn_url_or_wc)
    cmd.extend(['-r', '%s:%s' % (rev_start, rev_end)])
    if "@" not in svn_url_or_wc:
        peg_rev = max(rev_start, rev_end)
        target = "%s@%s" % (svn_url_or_wc, str(peg_rev))
    cmd.extend(['--limit', str(limit), target])
    return parse_svn_log_xml(run_svn(cmd))
230
def get_svn_status(svn_wc, quiet=False, no_recursive=False):
    """
    Run "svn status --xml --ignore-externals" on a working copy and return
    the entries as parsed by parse_svn_status_xml(), with paths made relative
    to the working copy and externals skipped.

    quiet selects "-q" instead of "-v"; no_recursive adds "-N".
    """
    # Canonicalize the path so that prefix-stripping in the parser works.
    wc_root = os.path.abspath(svn_wc)
    cmd = ['status', '--xml', '--ignore-externals']
    if quiet:
        cmd.append('-q')
    else:
        cmd.append('-v')
    if no_recursive:
        cmd.append('-N')
    cmd.append(wc_root)
    status_xml = run_svn(cmd)
    return parse_svn_status_xml(status_xml, wc_root, ignore_externals=True)
246
def get_svn_versioned_files(svn_wc):
    """
    Return the paths of all status entries in the working copy whose type is
    'normal' and whose (relative) path is non-empty.
    """
    return [
        entry['path']
        for entry in get_svn_status(svn_wc)
        if entry['path'] and entry['type'] == 'normal'
    ]
256
def get_one_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
    """
    Return the first log entry found in the range rev_start:rev_end.

    Raises EmptySVNLog when the range yields no entries at all.
    """
    found = run_svn_log(svn_url, rev_start, rev_end, 1, stop_on_copy, get_changed_paths, get_revprops)
    if not found:
        raise EmptySVNLog("No SVN log for %s between revisions %s and %s" %
                          (svn_url, rev_start, rev_end))
    return found[0]
266
def get_first_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=True, get_changed_paths=True):
    """
    Return the earliest log entry of an SVN branch within rev_start:rev_end,
    i.e. the first entry at or after rev_start.

    NOTE: to know whether the branch creation corresponds to an SVN import or
    a copy from another branch, inspect elements of the 'changed_paths' entry
    in the returned dictionary.

    Raises EmptySVNLog (via get_one_svn_log_entry) if nothing is found.
    """
    # Range is walked forwards, so the single entry fetched is the oldest one.
    return get_one_svn_log_entry(svn_url, rev_start, rev_end,
                                 stop_on_copy, get_changed_paths)
278
def get_last_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=True, get_changed_paths=True):
    """
    Return the latest log entry of an SVN branch within rev_start:rev_end,
    i.e. the last entry at or before rev_end.

    Raises EmptySVNLog (via get_one_svn_log_entry) if nothing is found.
    """
    # Walk the range backwards (rev_end down to rev_start) so that the single
    # entry fetched is the newest one.
    newest_first_start, newest_first_end = rev_end, rev_start
    return get_one_svn_log_entry(svn_url, newest_first_start, newest_first_end,
                                 stop_on_copy, get_changed_paths)
286
287
# Tuning knobs for the chunked fetching in iter_svn_log_entries():
# a request faster than the threshold (seconds) doubles the next chunk, one
# slower than twice the threshold halves it, within the min/max bounds below.
log_duration_threshold = 10.0
log_min_chunk_length = 10
log_max_chunk_length = 10000

def iter_svn_log_entries(svn_url, first_rev, last_rev, stop_on_copy=False, get_changed_paths=True, get_revprops=False, ancestors=None):
    """
    Iterate over SVN log entries between first_rev and last_rev.

    This function features chunked log fetching so that it isn't too nasty
    to the SVN server if many entries are requested.

    svn_url    -- URL (or working copy) whose history is walked.
    first_rev  -- first revision to consider (anything int() accepts).
    last_rev   -- last revision to consider, or the string "HEAD".
    ancestors  -- optional pre-calculated ancestry list (see NOTE); each item
                  is a dict with 'path', 'revision', 'copyfrom_path' and
                  'copyfrom_rev'. Defaults to no ancestry.

    Yields the dicts produced by parse_svn_log_xml(), each extended with a
    'url' key holding the URL the entry was actually fetched from.

    NOTE: If *not* passing in the explicit (pre-calculated) 'ancestors' list,
    this chunked log fetching *ONLY* works correctly on paths which
    are known to have existed unbroken in the SVN repository, e.g. /trunk.
    Chunked fetching breaks down if a path existed earlier, then was
    deleted, and later was re-created. For example, if path was created in r5,
    then deleted in r1000, and then later re-created in r5000...
      svn log --stop-on-copy --limit 1 -r 1:50 "path/to/file"
        --> would yield r5, i.e. the _initial_ creation
      svn log --stop-on-copy --limit 1 -r 1:HEAD "path/to/file"
        --> would yield r5000, i.e. the _re-creation_
    Use run/svn2svn.py:find_svn_ancestors() to pass in the 'ancestors' array
    so that we can correctly re-trace ancestry here.
    """
    if ancestors is None:
        # Avoid a shared mutable default argument.
        ancestors = []
    info = get_svn_info(svn_url)
    svn_repos_url = info['repos_url']
    if last_rev == "HEAD":
        last_rev = info['revision']
    # Normalize both bounds to integers so that every comparison and
    # arithmetic step below operates on numbers, never on mixed str/int.
    first_rev = int(first_rev)
    last_rev = int(last_rev)
    if first_rev == 1:
        # Skip ahead to the first revision that actually has a log entry.
        start_log = get_first_svn_log_entry(svn_url, first_rev, last_rev, stop_on_copy=stop_on_copy, get_changed_paths=False)
        if start_log['revision'] > first_rev:
            first_rev = start_log['revision']
    cur_url = svn_url
    cur_rev = first_rev
    cur_anc_idx = None
    cur_anc_end_rev = None
    if ancestors:
        # Crawl ancestry, from oldest to newest, to find the ancestor segment
        # that first_rev falls into.
        for idx in range(len(ancestors)-1, -1, -1): # [n-1,...,0]
            cur_url = svn_repos_url+ancestors[idx]['copyfrom_path']
            cur_anc_idx = idx
            if first_rev < int(ancestors[idx]['copyfrom_rev']):
                cur_anc_end_rev = int(ancestors[idx]['copyfrom_rev'])
                break
        if cur_anc_end_rev is None:
            # first_rev lies past every copy-from point: use the final path.
            cur_anc_idx = -1
            cur_url = svn_repos_url+ancestors[0]['path']
    chunk_length = log_min_chunk_length
    while cur_rev <= last_rev:
        if cur_anc_end_rev and cur_rev >= cur_anc_end_rev:
            # Reached the end of the current ancestor segment: jump to the
            # revision of the copy and continue on the next (newer) segment.
            cur_rev = int(ancestors[cur_anc_idx]['revision'])
            cur_anc_idx -= 1
            if cur_anc_idx >= 0:
                idx = cur_anc_idx
                cur_url = svn_repos_url+ancestors[idx]['copyfrom_path']
                cur_anc_end_rev = int(ancestors[idx]['copyfrom_rev'])
            else:
                cur_url = svn_repos_url+ancestors[0]['path']
                cur_anc_end_rev = None
        start_t = time.time()
        # Never request past last_rev, nor past the current segment's end.
        stop_rev = min(last_rev, cur_rev + chunk_length)
        stop_rev = min(stop_rev, cur_anc_end_rev) if cur_anc_end_rev else stop_rev
        entries = run_svn_log(cur_url, cur_rev, stop_rev, chunk_length,
                              stop_on_copy, get_changed_paths, get_revprops)
        duration = time.time() - start_t
        if entries:
            for e in entries:
                if e['revision'] > last_rev:
                    break
                # Embed the current URL in the yielded dict, for ancestor cases where
                # we might have followed a copy-from to some non-original URL.
                e['url'] = cur_url
                yield e
            if e['revision'] >= last_rev:
                break
            cur_rev = int(e['revision'])+1
        else:
            cur_rev = int(stop_rev)+1
        # Adapt chunk length based on measured request duration
        if duration < log_duration_threshold:
            chunk_length = min(log_max_chunk_length, int(chunk_length * 2.0))
        elif duration > log_duration_threshold * 2:
            chunk_length = max(log_min_chunk_length, int(chunk_length / 2.0))
380
381
# Cached result of get_svn_client_version(); None until first computed.
_svn_client_version = None


def get_svn_client_version():
    """
    Return the SVN client version as a tuple of ints.

    The output of "svn --version -q" is split on dots; components that do not
    consist solely of digits are silently skipped. The result is computed once
    and cached at module level.
    """
    global _svn_client_version
    if _svn_client_version is not None:
        return _svn_client_version
    version_text = run_svn(['--version', '-q']).strip()
    numeric_parts = [int(part) for part in version_text.split('.') if part.isdigit()]
    _svn_client_version = tuple(numeric_parts)
    return _svn_client_version
397
398
def parse_svn_propget_xml(xml_string):
    """
    Parse the XML output from an "svn propget" command and extract useful
    information as a dict with the keys 'name' and 'value'.

    Line endings in the value are normalized to "\\n". If the output holds no
    property element at all, 'name' is None and 'value' is the empty string.
    """
    xml_string = strip_forbidden_xml_chars(xml_string)
    tree = ET.fromstring(xml_string)
    prop = tree.find('.//property')
    if prop is None:
        # No property element: report an empty result rather than failing
        # with AttributeError on the attribute lookup below.
        return {'name': None, 'value': ""}
    value = prop.text or ""
    value = value.replace('\r\n', '\n').replace('\n\r', '\n').replace('\r', '\n')
    return {'name': prop.get('name'), 'value': value}
411
def parse_svn_proplist_xml(xml_string):
    """
    Turn the XML produced by "svn proplist --xml" into a list holding the
    'name' attribute of every property element, in document order.
    """
    root = ET.fromstring(strip_forbidden_xml_chars(xml_string))
    return [prop_node.get('name') for prop_node in root.findall('.//property')]
423
def get_prop_value(svn_url_or_wc, prop_name, rev_number=None):
    """
    Fetch one versioned property of the given path via "svn propget --xml"
    and return it as parsed by parse_svn_propget_xml().

    A truthy rev_number is passed as "-r" and, unless the target already
    contains "@", also appended to the target as the peg revision.
    """
    cmd = ['propget', '--xml']
    target = str(svn_url_or_wc)
    if rev_number:
        cmd.extend(['-r', rev_number])
        if "@" not in svn_url_or_wc:
            target = "%s@%s" % (svn_url_or_wc, str(rev_number))
    cmd.extend([prop_name, target])
    return parse_svn_propget_xml(run_svn(cmd))
437
def get_all_props(svn_url_or_wc, rev_number=None):
    """
    Return a dict mapping property name to value for every versioned property
    of the given path.

    Names come from "svn proplist --xml"; each value is then fetched
    individually with get_prop_value(). A truthy rev_number is passed as "-r"
    and, unless the target already contains "@", appended as peg revision.
    """
    cmd = ['proplist', '--xml']
    target = str(svn_url_or_wc)
    if rev_number:
        cmd.extend(['-r', rev_number])
        if "@" not in svn_url_or_wc:
            target = "%s@%s" % (svn_url_or_wc, str(rev_number))
    cmd.append(target)
    prop_names = parse_svn_proplist_xml(run_svn(cmd))
    fetched = (get_prop_value(svn_url_or_wc, name, rev_number) for name in prop_names)
    return dict((entry['name'], entry['value']) for entry in fetched)
453 d = get_prop_value(svn_url_or_wc, prop_name, rev_number)
454 l[d['name']] = d['value']
455 return l