svn2svn/svnclient.py

   1 """ SVN client functions """
   2
   3 from shell import run_svn
   4 from errors import EmptySVNLog
   5
   6 import os
   7 import time
   8 import calendar
   9 import operator
  10 import urllib
  11
  12 try:
  13     from xml.etree import cElementTree as ET
  14 except ImportError:
  15     try:
  16         from xml.etree import ElementTree as ET
  17     except ImportError:
  18         try:
  19             import cElementTree as ET
  20         except ImportError:
  21             from elementtree import ElementTree as ET
  22
  23 _identity_table = "".join(map(chr, range(256)))
  24 _forbidden_xml_chars = "".join(
  25     set(map(chr, range(32))) - set('\x09\x0A\x0D')
  26 )
  27
  28 valid_svn_actions = "MARD"   # The list of known SVN action abbr's, from "svn log"
  29
  30 def _strip_forbidden_xml_chars(xml_string):
  31     """
  32     Given an XML string, strips forbidden characters as per the XML spec.
  33     (these are all control characters except 0x9, 0xA and 0xD).
  34     """
  35     return xml_string.translate(_identity_table, _forbidden_xml_chars)
  36
  37 def safe_path(path, rev_number=None):
  38     """
  39     Build a path to pass as a SVN command-line arg.
  40     """
  41     # URL-escape URL's, but leave local WC paths alone
  42     if "://" in path:
  43         path = urllib.quote(path, ":/")
  44     # Add peg revision
  45     if rev_number is not None:
  46         path += "@"+str(rev_number)
  47     # Else, if path already contains an "@", add a trailing "@" to "escape" the earlier "@".
  48     elif "@" in path:
  49         path += "@"
  50     return path
  51
  52 def _svn_date_to_timestamp(svn_date):
  53     """
  54     Parse an SVN date as read from the XML output and return the corresponding
  55     timestamp.
  56     """
  57     # Strip microseconds and timezone (always UTC, hopefully)
  58     # XXX there are various ISO datetime parsing routines out there,
  59     # cf. http://seehuhn.de/comp/pdate
  60     date = svn_date.split('.', 2)[0]
  61     time_tuple = time.strptime(date, "%Y-%m-%dT%H:%M:%S")
  62     return calendar.timegm(time_tuple)
  63
  64 def _parse_svn_info_xml(xml_string):
  65     """
  66     Parse the XML output from an "svn info" command and extract useful information
  67     as a dict.
  68     """
  69     d = {}
  70     xml_string = _strip_forbidden_xml_chars(xml_string)
  71     tree = ET.fromstring(xml_string)
  72     entry = tree.find('.//entry')
  73     d['url'] = entry.find('url').text
  74     d['kind'] = entry.get('kind')
  75     d['revision'] = int(entry.get('revision'))
  76     d['repos_url'] = tree.find('.//repository/root').text
  77     d['repos_uuid'] = tree.find('.//repository/uuid').text
  78     d['last_changed_rev'] = int(tree.find('.//commit').get('revision'))
  79     author_element = tree.find('.//commit/author')
  80     if author_element is not None:
  81         d['last_changed_author'] = author_element.text
  82     d['last_changed_date'] = _svn_date_to_timestamp(tree.find('.//commit/date').text)
  83     # URL-decode "url" and "repos_url" values, since all paths passed
  84     # to run_svn() should be filtered through safe_path() and we don't
  85     # want to *double* URL-encode paths which are constructed used these values.
  86     d['url'] = urllib.unquote(d['url'])
  87     d['repos_url'] = urllib.unquote(d['repos_url'])
  88     return d
  89
  90 def get_kind(svn_repos_url, svn_path, svn_rev, action, paths):
  91     """
  92     Calculate the "kind"-type of a given URL in the SVN repo.
  93     """
  94     # By default, just do a simple "svn info" based on passed-in params.
  95     info_path = svn_path
  96     info_rev =  svn_rev
  97     if action == 'D':
  98         # For deletions, we can't do an "svn info" at this revision.
  99         # Need to trace ancestry backwards.
 100         parents = []
 101         for p in paths:
 102             # Build a list of any copy-from's in this log_entry that we're a child of.
 103             if p['kind'] == 'dir' and p['copyfrom_revision'] and svn_path.startswith(p['path']+"/"):
 104                 parents.append(p['path'])
 105         if parents:
 106             # Use the nearest copy-from'd parent
 107             parents.sort()
 108             parent = parents[len(parents)-1]
 109             for p in paths:
 110                 if parent == p['path']:
 111                     info_path = info_path.replace(p['path'], p['copyfrom_path'])
 112                     info_rev =  p['copyfrom_revision']
 113         else:
 114             # If no parent copy-from's, then we should be able to check this path in
 115             # the preceeding revision.
 116             info_rev -= 1
 117     svn_info = info(svn_repos_url+info_path, info_rev)
 118     return svn_info['kind']
 119
 120 def _parse_svn_log_xml(xml_string):
 121     """
 122     Parse the XML output from an "svn log" command and extract useful information
 123     as a list of dicts (one per log changeset).
 124     """
 125     l = []
 126     xml_string = _strip_forbidden_xml_chars(xml_string)
 127     tree = ET.fromstring(xml_string)
 128     for entry in tree.findall('logentry'):
 129         d = {}
 130         d['revision'] = int(entry.get('revision'))
 131         # Some revisions don't have authors, most notably the first revision
 132         # in a repository.
 133         # logentry nodes targeting directories protected by path-based
 134         # authentication have no child nodes at all. We return an entry
 135         # in that case. Anyway, as it has no path entries, no further
 136         # processing will be made.
 137         author = entry.find('author')
 138         date = entry.find('date')
 139         msg = entry.find('msg')
 140         d['author'] = author is not None and author.text or "No author"
 141         d['date_raw'] = date.text if date is not None else None
 142         d['date'] = _svn_date_to_timestamp(date.text) if date is not None else None
 143         d['message'] = msg is not None and msg.text and msg.text.replace('\r\n', '\n').replace('\n\r', '\n').replace('\r', '\n') or ""
 144         paths = []
 145         for path in entry.findall('.//paths/path'):
 146             copyfrom_rev = path.get('copyfrom-rev')
 147             if copyfrom_rev:
 148                 copyfrom_rev = int(copyfrom_rev)
 149             paths.append({
 150                 'path': path.text,
 151                 'kind': path.get('kind'),
 152                 'action': path.get('action'),
 153                 'copyfrom_path': path.get('copyfrom-path'),
 154                 'copyfrom_revision': copyfrom_rev,
 155             })
 156         # Sort paths (i.e. into hierarchical order), so that process_svn_log_entry()
 157         # can process actions in depth-first order.
 158         d['changed_paths'] = sorted(paths, key=operator.itemgetter('path'))
 159         revprops = []
 160         for prop in entry.findall('.//revprops/property'):
 161             revprops.append({ 'name': prop.get('name'), 'value': prop.text })
 162         d['revprops'] = revprops
 163         l.append(d)
 164     return l
 165
 166 def _parse_svn_status_xml(xml_string, base_dir=None, ignore_externals=False):
 167     """
 168     Parse the XML output from an "svn status" command and extract useful info
 169     as a list of dicts (one per status entry).
 170     """
 171     if base_dir:
 172         base_dir = os.path.normcase(base_dir)
 173     l = []
 174     xml_string = _strip_forbidden_xml_chars(xml_string)
 175     tree = ET.fromstring(xml_string)
 176     for entry in tree.findall('.//entry'):
 177         d = {}
 178         path = entry.get('path')
 179         if base_dir is not None and os.path.normcase(path).startswith(base_dir):
 180             path = path[len(base_dir):].lstrip('/\\')
 181         d['path'] = path
 182         wc_status = entry.find('wc-status')
 183         if wc_status.get('item') == 'external':
 184             if ignore_externals:
 185                 continue
 186         status =   wc_status.get('item')
 187         revision = wc_status.get('revision')
 188         if status == 'external':
 189             d['type'] = 'external'
 190         elif revision is not None:
 191             d['type'] = 'normal'
 192         else:
 193             d['type'] = 'unversioned'
 194         d['status'] =   status
 195         d['revision'] = revision
 196         d['props'] =    wc_status.get('props')
 197         d['copied'] =   wc_status.get('copied')
 198         l.append(d)
 199     return l
 200
 201 def get_rev(svn_url_or_wc, rev_number):
 202     """
 203     Evaluate a given SVN revision pattern, to map it to a discrete rev #.
 204     """
 205     xml_string = run_svn(['info', '--xml', '-r', rev_number, safe_path(svn_url_or_wc, rev_number)], fail_if_stderr=True)
 206     info = _parse_svn_info_xml(xml_string)
 207     return info['revision']
 208
 209 def info(svn_url_or_wc, rev_number=None):
 210     """
 211     Get SVN information for the given URL or working copy, with an optionally
 212     specified revision number.
 213     Returns a dict as created by _parse_svn_info_xml().
 214     """
 215     args = ['info', '--xml']
 216     if rev_number is not None:
 217         args += ["-r", rev_number]
 218     args += [safe_path(svn_url_or_wc, rev_number)]
 219     xml_string = run_svn(args, fail_if_stderr=True)
 220     return _parse_svn_info_xml(xml_string)
 221
 222 def svn_checkout(svn_url, checkout_dir, rev_number=None):
 223     """
 224     Checkout the given URL at an optional revision number.
 225     """
 226     args = ['checkout', '-q']
 227     if rev_number is not None:
 228         args += ['-r', rev_number]
 229     args += [safe_path(svn_url, rev_number), checkout_dir]
 230     return run_svn(args)
 231
 232 def run_svn_log(svn_url_or_wc, rev_start, rev_end, limit, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
 233     """
 234     Fetch up to 'limit' SVN log entries between the given revisions.
 235     """
 236     args = ['log', '--xml']
 237     if stop_on_copy:
 238         args += ['--stop-on-copy']
 239     if get_changed_paths:
 240         args += ['-v']
 241     if get_revprops:
 242         args += ['--with-all-revprops']
 243     args += ['-r', '%s:%s' % (rev_start, rev_end)]
 244     args += ['--limit', str(limit), safe_path(svn_url_or_wc, max(rev_start, rev_end))]
 245     xml_string = run_svn(args)
 246     return _parse_svn_log_xml(xml_string)
 247
 248 def status(svn_wc, quiet=False, non_recursive=False):
 249     """
 250     Get SVN status information about the given working copy.
 251     """
 252     # Ensure proper stripping by canonicalizing the path
 253     svn_wc = os.path.abspath(svn_wc)
 254     args = ['status', '--xml', '--ignore-externals']
 255     if quiet:
 256         args += ['-q']
 257     else:
 258         args += ['-v']
 259     if non_recursive:
 260         args += ['-N']
 261     xml_string = run_svn(args + [safe_path(svn_wc)])
 262     return _parse_svn_status_xml(xml_string, svn_wc, ignore_externals=True)
 263
 264 def get_svn_versioned_files(svn_wc):
 265     """
 266     Get the list of versioned files in the SVN working copy.
 267     """
 268     contents = []
 269     for e in status(svn_wc):
 270         if e['path'] and e['type'] == 'normal':
 271             contents.append(e['path'])
 272     return contents
 273
 274 def get_one_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=False, get_changed_paths=True, get_revprops=False):
 275     """
 276     Get the first SVN log entry in the requested revision range.
 277     """
 278     entries = run_svn_log(svn_url, rev_start, rev_end, 1, stop_on_copy, get_changed_paths, get_revprops)
 279     if entries:
 280         return entries[0]
 281     raise EmptySVNLog("No SVN log for %s between revisions %s and %s" %
 282         (svn_url, rev_start, rev_end))
 283
 284 def get_first_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=True, get_changed_paths=True):
 285     """
 286     Get the first log entry after (or at) the given revision number in an SVN branch.
 287     By default the revision number is set to 0, which will give you the log
 288     entry corresponding to the branch creaction.
 289
 290     NOTE: to know whether the branch creation corresponds to an SVN import or
 291     a copy from another branch, inspect elements of the 'changed_paths' entry
 292     in the returned dictionary.
 293     """
 294     return get_one_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=stop_on_copy, get_changed_paths=get_changed_paths)
 295
 296 def get_last_svn_log_entry(svn_url, rev_start, rev_end, stop_on_copy=True, get_changed_paths=True):
 297     """
 298     Get the last log entry before/at the given revision number in an SVN branch.
 299     By default the revision number is set to HEAD, which will give you the log
 300     entry corresponding to the latest commit in branch.
 301     """
 302     return get_one_svn_log_entry(svn_url, rev_end, rev_start, stop_on_copy=stop_on_copy, get_changed_paths=get_changed_paths)
 303
 304
 305 log_duration_threshold = 10.0
 306 log_min_chunk_length = 10
 307 log_max_chunk_length = 10000
 308
 309 def iter_svn_log_entries(svn_url, first_rev, last_rev, stop_on_copy=False, get_changed_paths=True, get_revprops=False, ancestors=[]):
 310     """
 311     Iterate over SVN log entries between first_rev and last_rev.
 312
 313     This function features chunked log fetching so that it isn't too nasty
 314     to the SVN server if many entries are requested.
 315
 316     NOTE: If *not* passing in the explicit (pre-calculated) 'ancestors' list,
 317     this chunked log fetching *ONLY* works correctly on paths which
 318     are known to have existed unbroken in the SVN repository, e.g. /trunk.
 319     Chunked fetching breaks down if a path existed in earlier, then was
 320     deleted, and later was re-created. For example, if path was created in r5,
 321     then deleted in r1000, and then later re-created in r5000...
 322       svn log --stop-on-copy --limit 1 -r 1:50 "path/to/file"
 323         --> would yield r5, i.e. the _initial_ creation
 324       svn log --stop-on-copy --limit 1 -r 1:HEAD "path/to/file"
 325         --> would yield r5000, i.e. the _re-creation_
 326     Use run/svnreplay.py:find_svn_ancestors() to pass in the 'ancestors' array
 327     so that we can correctly re-trace ancestry here.
 328     """
 329     svn_info = info(svn_url)
 330     svn_repos_url = svn_info['repos_url']
 331     #print "iter_svn_log_entries: %s %s:%s" % (svn_url, first_rev, last_rev)
 332     if last_rev == "HEAD":
 333         last_rev = svn_info['revision']
 334     if int(first_rev) == 1:
 335         start_log = get_first_svn_log_entry(svn_url, first_rev, last_rev, stop_on_copy=stop_on_copy, get_changed_paths=False)
 336         if start_log['revision'] > first_rev:
 337             first_rev = start_log['revision']
 338     #print "first_rev: %s" % first_rev
 339     cur_url = svn_url
 340     cur_rev = first_rev
 341     cur_anc_idx = None
 342     cur_anc_end_rev = None
 343     if ancestors:
 344         #print ancestors
 345         # Crawl ancestry, from oldest to newest
 346         for idx in range(len(ancestors)-1, -1, -1):  # [n-1,...,0]
 347             #print "(pre) Match ancestors[%s]: %s" % (idx, ancestors[idx])
 348             cur_url = svn_repos_url+ancestors[idx]['copyfrom_path']
 349             cur_anc_idx = idx
 350             if first_rev < int(ancestors[idx]['copyfrom_rev']):
 351                 cur_anc_end_rev = int(ancestors[idx]['copyfrom_rev'])
 352                 break
 353         if cur_anc_end_rev is None:
 354             #print "(pre) Match ancestors[0] (final): %s" % (ancestors[0])
 355             cur_anc_idx = -1
 356             cur_url = svn_repos_url+ancestors[0]['path']
 357     chunk_length = log_min_chunk_length
 358     while cur_rev <= last_rev:
 359         #print "cur_rev:%s cur_anc_end_rev:%s cur_anc_idx:%s  %s" % (cur_rev, str(cur_anc_end_rev), cur_anc_idx, cur_url)
 360         if cur_anc_end_rev and cur_rev >= cur_anc_end_rev:
 361             cur_rev = int(ancestors[cur_anc_idx]['revision'])
 362             cur_anc_idx -= 1
 363             if cur_anc_idx >= 0:
 364                 idx = cur_anc_idx
 365                 #print "(loop) Match ancestors[%s]: %s" % (idx, ancestors[idx])
 366                 cur_url = svn_repos_url+ancestors[idx]['copyfrom_path']
 367                 cur_anc_end_rev = int(ancestors[idx]['copyfrom_rev'])
 368             else:
 369                 #print "(loop) Match ancestors[0] (final): %s" % (ancestors[0])
 370                 cur_url = svn_repos_url+ancestors[0]['path']
 371                 cur_anc_end_rev = None
 372         #print "cur_rev:%s cur_anc_end_rev:%s cur_anc_idx:%s  %s" % (cur_rev, str(cur_anc_end_rev), cur_anc_idx, cur_url)
 373         start_t = time.time()
 374         stop_rev = min(last_rev, cur_rev + chunk_length)
 375         stop_rev = min(stop_rev, cur_anc_end_rev) if cur_anc_end_rev else stop_rev
 376         entries = run_svn_log(cur_url, cur_rev, stop_rev, chunk_length,
 377                               stop_on_copy, get_changed_paths, get_revprops)
 378         duration = time.time() - start_t
 379         if entries:
 380             for e in entries:
 381                 if e['revision'] > last_rev:
 382                     break
 383                 # Embed the current URL in the yielded dict, for ancestor cases where
 384                 # we might have followed a copy-from to some non-original URL.
 385                 e['url'] = cur_url
 386                 yield e
 387             if e['revision'] >= last_rev:
 388                 break
 389             cur_rev = int(e['revision'])+1
 390         else:
 391             cur_rev = int(stop_rev)+1
 392         # Adapt chunk length based on measured request duration
 393         if duration < log_duration_threshold:
 394             chunk_length = min(log_max_chunk_length, int(chunk_length * 2.0))
 395         elif duration > log_duration_threshold * 2:
 396             chunk_length = max(log_min_chunk_length, int(chunk_length / 2.0))
 397
 398
 399 _svn_client_version = None
 400
 401 def version():
 402     """
 403     Returns the SVN client version as a tuple.
 404
 405     The returned tuple only contains numbers, non-digits in version string are
 406     silently ignored.
 407     """
 408     global _svn_client_version
 409     if _svn_client_version is None:
 410         raw = run_svn(['--version', '-q']).strip()
 411         _svn_client_version = tuple(map(int, [x for x in raw.split('.')
 412                                               if x.isdigit()]))
 413     return _svn_client_version
 414
 415
 416 def _parse_svn_propget_xml(xml_string):
 417     """
 418     Parse the XML output from an "svn propget" command and extract useful
 419     information as a dict.
 420     """
 421     d = {}
 422     xml_string = _strip_forbidden_xml_chars(xml_string)
 423     tree = ET.fromstring(xml_string)
 424     prop = tree.find('.//property')
 425     d['name'] = prop.get('name')
 426     d['value'] = prop is not None and prop.text and prop.text.replace('\r\n', '\n').replace('\n\r', '\n').replace('\r', '\n') or ""
 427     return d
 428
 429 def _parse_svn_proplist_xml(xml_string):
 430     """
 431     Parse the XML output from an "svn proplist" command and extract list
 432     of property-names.
 433     """
 434     l = []
 435     xml_string = _strip_forbidden_xml_chars(xml_string)
 436     tree = ET.fromstring(xml_string)
 437     for prop in tree.findall('.//property'):
 438         l.append(prop.get('name'))
 439     return l
 440
 441 def propget(svn_url_or_wc, prop_name, rev_number=None):
 442     """
 443     Get the value of a versioned property for the given path.
 444     """
 445     args = ['propget', '--xml']
 446     if rev_number:
 447         args += ['-r', rev_number]
 448     args += [prop_name, safe_path(svn_url_or_wc, rev_number)]
 449     xml_string = run_svn(args)
 450     return _parse_svn_propget_xml(xml_string)
 451
 452 def propget_all(svn_url_or_wc, rev_number=None):
 453     """
 454     Get the values of all versioned properties for the given path.
 455     """
 456     l = {}
 457     args = ['proplist', '--xml']
 458     if rev_number:
 459         args += ['-r', rev_number]
 460     args += [safe_path(svn_url_or_wc, rev_number)]
 461     xml_string = run_svn(args)
 462     props = _parse_svn_proplist_xml(xml_string)
 463     for prop_name in props:
 464         d = propget(svn_url_or_wc, prop_name, rev_number)
 465         l[d['name']] = d['value']
 466     return l
 467
 468 def update(path, non_recursive=False):
 469     """
 470     Update a path in a working-copy.
 471     """
 472     args = ['update', '--ignore-externals']
 473     if non_recursive:
 474         args += ['-N']
 475     args += [safe_path(path)]
 476     run_svn(args)
 477
 478 def remove(path, force=False):
 479     """
 480     Remove a file/directory in a working-copy.
 481     """
 482     args = ['remove']
 483     if force:
 484         args += ['--force']
 485     args += [safe_path(path)]
 486     run_svn(args)
 487
 488 def export(svn_url, rev_number, path, non_recursive=False, force=False):
 489     """
 490     Export a file from a repo to a local path.
 491     """
 492     args = ['export', '--ignore-externals', '-r', rev_number]
 493     if non_recursive:
 494         args += ['-N']
 495     if force:
 496         args += ['--force']
 497     args += [safe_path(svn_url, rev_number), safe_path(path)]
 498     run_svn(args)
 499
 500 def _parse_svn_list_xml(xml_string):
 501     """
 502     Parse the XML output from an "svn list" command and extract list
 503     of contents.
 504     """
 505     l = []
 506     xml_string = _strip_forbidden_xml_chars(xml_string)
 507     tree = ET.fromstring(xml_string)
 508     d = []
 509     for entry in tree.findall('.//entry'):
 510         d = { 'path': entry.find('.//name').text,
 511               'kind': entry.get('kind') }
 512         l.append(d)
 513     return l
 514
 515 def list(svn_url_or_wc, rev_number=None, recursive=False):
 516     """
 517     List the contents of a path as they exist in the repo.
 518     """
 519     args = ['list', '--xml']
 520     if rev_number:
 521         args += ['-r', rev_number]
 522     if recursive:
 523         args += ['-R']
 524     args += [safe_path(svn_url_or_wc, rev_number)]
 525     xml_string = run_svn(args, no_fail=True)
 526     # If svn_url_or_wc is a WC path which hasn't been committed yet,
 527     # 'svn list' won't return a valid XML document. Gracefully short-circuit.
 528     if not "</lists>" in xml_string:
 529         return []
 530     return _parse_svn_list_xml(xml_string)