From 2919cf0d03b37050c6624d97547653d1fffa033d Mon Sep 17 00:00:00 2001 From: Brian Harring Date: Sat, 13 Oct 2012 17:49:44 -0700 Subject: import of content; note rcsparse has had my old http://cvs2svn.tigris.org/nonav/issues/showattachment.cgi/64/rcparse_redundant_work.patch patch applied. --- .gitignore | 2 + config | 588 ++++++++ cvs2svn_lib/__init__.py | 18 + cvs2svn_lib/apple_single_filter.py | 292 ++++ cvs2svn_lib/artifact.py | 59 + cvs2svn_lib/artifact_manager.py | 256 ++++ cvs2svn_lib/bzr_run_options.py | 175 +++ cvs2svn_lib/changeset.py | 269 ++++ cvs2svn_lib/changeset_database.py | 70 + cvs2svn_lib/changeset_graph.py | 456 ++++++ cvs2svn_lib/changeset_graph_link.py | 149 ++ cvs2svn_lib/changeset_graph_node.py | 50 + cvs2svn_lib/check_dependencies_pass.py | 144 ++ cvs2svn_lib/checkout_internal.py | 778 +++++++++++ cvs2svn_lib/collect_data.py | 1431 +++++++++++++++++++ cvs2svn_lib/common.py | 409 ++++++ cvs2svn_lib/config.py | 221 +++ cvs2svn_lib/context.py | 93 ++ cvs2svn_lib/cvs_file.py | 287 ++++ cvs2svn_lib/cvs_file_database.py | 75 + cvs2svn_lib/cvs_file_items.py | 1075 +++++++++++++++ cvs2svn_lib/cvs_item.py | 901 ++++++++++++ cvs2svn_lib/cvs_item_database.py | 248 ++++ cvs2svn_lib/cvs_revision_manager.py | 85 ++ cvs2svn_lib/database.py | 322 +++++ cvs2svn_lib/dumpfile_delegate.py | 510 +++++++ cvs2svn_lib/fill_source.py | 192 +++ cvs2svn_lib/fulltext_revision_recorder.py | 127 ++ cvs2svn_lib/git_output_option.py | 658 +++++++++ cvs2svn_lib/git_revision_recorder.py | 114 ++ cvs2svn_lib/git_run_options.py | 274 ++++ cvs2svn_lib/key_generator.py | 45 + cvs2svn_lib/log.py | 174 +++ cvs2svn_lib/main.py | 117 ++ cvs2svn_lib/man_writer.py | 197 +++ cvs2svn_lib/metadata.py | 26 + cvs2svn_lib/metadata_database.py | 102 ++ cvs2svn_lib/openings_closings.py | 236 ++++ cvs2svn_lib/output_option.py | 85 ++ cvs2svn_lib/pass_manager.py | 215 +++ cvs2svn_lib/passes.py | 1837 +++++++++++++++++++++++++ cvs2svn_lib/persistence_manager.py | 106 ++ cvs2svn_lib/process.py | 
116 ++ cvs2svn_lib/project.py | 219 +++ cvs2svn_lib/property_setters.py | 385 ++++++ cvs2svn_lib/rcs_revision_manager.py | 51 + cvs2svn_lib/rcs_stream.py | 149 ++ cvs2svn_lib/record_table.py | 399 ++++++ cvs2svn_lib/repository_delegate.py | 98 ++ cvs2svn_lib/repository_mirror.py | 897 ++++++++++++ cvs2svn_lib/revision_manager.py | 189 +++ cvs2svn_lib/run_options.py | 1035 ++++++++++++++ cvs2svn_lib/serializer.py | 146 ++ cvs2svn_lib/stats_keeper.py | 189 +++ cvs2svn_lib/stdout_delegate.py | 107 ++ cvs2svn_lib/svn_commit.py | 381 +++++ cvs2svn_lib/svn_commit_creator.py | 217 +++ cvs2svn_lib/svn_commit_item.py | 50 + cvs2svn_lib/svn_output_option.py | 753 ++++++++++ cvs2svn_lib/svn_repository_delegate.py | 121 ++ cvs2svn_lib/svn_revision_range.py | 171 +++ cvs2svn_lib/svn_run_options.py | 543 ++++++++ cvs2svn_lib/symbol.py | 246 ++++ cvs2svn_lib/symbol_database.py | 68 + cvs2svn_lib/symbol_statistics.py | 521 +++++++ cvs2svn_lib/symbol_strategy.py | 685 +++++++++ cvs2svn_lib/symbol_transform.py | 236 ++++ cvs2svn_lib/time_range.py | 44 + cvs2svn_lib/version.py | 27 + cvs2svn_rcsparse/__init__.py | 26 + cvs2svn_rcsparse/common.py | 324 +++++ cvs2svn_rcsparse/debug.py | 122 ++ cvs2svn_rcsparse/default.py | 172 +++ cvs2svn_rcsparse/parse_rcs_file.py | 73 + cvs2svn_rcsparse/rcparse_redundant_work.patch | 99 ++ cvs2svn_rcsparse/run-tests.py | 73 + cvs2svn_rcsparse/texttools.py | 348 +++++ 77 files changed, 22748 insertions(+) create mode 100644 .gitignore create mode 100644 config create mode 100644 cvs2svn_lib/__init__.py create mode 100644 cvs2svn_lib/apple_single_filter.py create mode 100644 cvs2svn_lib/artifact.py create mode 100644 cvs2svn_lib/artifact_manager.py create mode 100644 cvs2svn_lib/bzr_run_options.py create mode 100644 cvs2svn_lib/changeset.py create mode 100644 cvs2svn_lib/changeset_database.py create mode 100644 cvs2svn_lib/changeset_graph.py create mode 100644 cvs2svn_lib/changeset_graph_link.py create mode 100644 cvs2svn_lib/changeset_graph_node.py 
create mode 100644 cvs2svn_lib/check_dependencies_pass.py create mode 100644 cvs2svn_lib/checkout_internal.py create mode 100644 cvs2svn_lib/collect_data.py create mode 100644 cvs2svn_lib/common.py create mode 100644 cvs2svn_lib/config.py create mode 100644 cvs2svn_lib/context.py create mode 100644 cvs2svn_lib/cvs_file.py create mode 100644 cvs2svn_lib/cvs_file_database.py create mode 100644 cvs2svn_lib/cvs_file_items.py create mode 100644 cvs2svn_lib/cvs_item.py create mode 100644 cvs2svn_lib/cvs_item_database.py create mode 100644 cvs2svn_lib/cvs_revision_manager.py create mode 100644 cvs2svn_lib/database.py create mode 100644 cvs2svn_lib/dumpfile_delegate.py create mode 100644 cvs2svn_lib/fill_source.py create mode 100644 cvs2svn_lib/fulltext_revision_recorder.py create mode 100644 cvs2svn_lib/git_output_option.py create mode 100644 cvs2svn_lib/git_revision_recorder.py create mode 100644 cvs2svn_lib/git_run_options.py create mode 100644 cvs2svn_lib/key_generator.py create mode 100644 cvs2svn_lib/log.py create mode 100644 cvs2svn_lib/main.py create mode 100644 cvs2svn_lib/man_writer.py create mode 100644 cvs2svn_lib/metadata.py create mode 100644 cvs2svn_lib/metadata_database.py create mode 100644 cvs2svn_lib/openings_closings.py create mode 100644 cvs2svn_lib/output_option.py create mode 100644 cvs2svn_lib/pass_manager.py create mode 100644 cvs2svn_lib/passes.py create mode 100644 cvs2svn_lib/persistence_manager.py create mode 100644 cvs2svn_lib/process.py create mode 100644 cvs2svn_lib/project.py create mode 100644 cvs2svn_lib/property_setters.py create mode 100644 cvs2svn_lib/rcs_revision_manager.py create mode 100644 cvs2svn_lib/rcs_stream.py create mode 100644 cvs2svn_lib/record_table.py create mode 100644 cvs2svn_lib/repository_delegate.py create mode 100644 cvs2svn_lib/repository_mirror.py create mode 100644 cvs2svn_lib/revision_manager.py create mode 100644 cvs2svn_lib/run_options.py create mode 100644 cvs2svn_lib/serializer.py create mode 100644 
cvs2svn_lib/stats_keeper.py create mode 100644 cvs2svn_lib/stdout_delegate.py create mode 100644 cvs2svn_lib/svn_commit.py create mode 100644 cvs2svn_lib/svn_commit_creator.py create mode 100644 cvs2svn_lib/svn_commit_item.py create mode 100644 cvs2svn_lib/svn_output_option.py create mode 100644 cvs2svn_lib/svn_repository_delegate.py create mode 100644 cvs2svn_lib/svn_revision_range.py create mode 100644 cvs2svn_lib/svn_run_options.py create mode 100644 cvs2svn_lib/symbol.py create mode 100644 cvs2svn_lib/symbol_database.py create mode 100644 cvs2svn_lib/symbol_statistics.py create mode 100644 cvs2svn_lib/symbol_strategy.py create mode 100644 cvs2svn_lib/symbol_transform.py create mode 100644 cvs2svn_lib/time_range.py create mode 100644 cvs2svn_lib/version.py create mode 100644 cvs2svn_rcsparse/__init__.py create mode 100644 cvs2svn_rcsparse/common.py create mode 100644 cvs2svn_rcsparse/debug.py create mode 100644 cvs2svn_rcsparse/default.py create mode 100644 cvs2svn_rcsparse/parse_rcs_file.py create mode 100644 cvs2svn_rcsparse/rcparse_redundant_work.patch create mode 100644 cvs2svn_rcsparse/run-tests.py create mode 100644 cvs2svn_rcsparse/texttools.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b5efc7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +cvs-repo +output diff --git a/config b/config new file mode 100644 index 0000000..94c17d7 --- /dev/null +++ b/config @@ -0,0 +1,588 @@ +# (Be in -*- mode: python; coding: utf-8 -*- mode.) +# +# ==================================================================== +# Copyright (c) 2006-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. 
+# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +# ##################### +# ## PLEASE READ ME! ## +# ##################### +# +# This is a template for an options file that can be used to configure +# cvs2svn to convert to git rather than to Subversion. See +# www/cvs2git.html and www/cvs2svn.html for general information, and +# see the comments in this file for information about what options are +# available and how they can be set. +# +# The program that is run to convert from CVS to git is called +# cvs2git. Run it with the --options option, passing it this file +# like this: +# +# cvs2git --options=cvs2git-example.options +# +# The output of cvs2git is a blob file and a dump file that can be +# loaded into git using the "git fast-import" command. Please read +# www/cvs2git.html for more information. +# +# Many options do not have defaults, so it is easier to copy this file +# and modify what you need rather than creating a new options file +# from scratch. This file is in Python syntax, but you don't need to +# know Python to modify it. But if you *do* know Python, then you +# will be happy to know that you can use arbitary Python constructs to +# do fancy configuration tricks. +# +# But please be aware of the following: +# +# * In many places, leading whitespace is significant in Python (it is +# used instead of curly braces to group statements together). +# Therefore, if you don't know what you are doing, it is best to +# leave the whitespace as it is. +# +# * In normal strings, Python treats a backslash ("\") as an escape +# character. 
Therefore, if you want to specify a string that +# contains a backslash, you need either to escape the backslash with +# another backslash ("\\"), or use a "raw string", as in one if the +# following equivalent examples: +# +# ctx.sort_executable = 'c:\\windows\\system32\\sort.exe' +# ctx.sort_executable = r'c:\windows\system32\sort.exe' +# +# See http://docs.python.org/tutorial/introduction.html#strings for +# more information. +# +# Two identifiers will have been defined before this file is executed, +# and can be used freely within this file: +# +# ctx -- a Ctx object (see cvs2svn_lib/context.py), which holds +# many configuration options +# +# run_options -- an instance of the GitRunOptions class (see +# cvs2svn_lib/git_run_options.py), which holds some variables +# governing how cvs2git is run + + +# Import some modules that are used in setting the options: +import re + +from cvs2svn_lib import config +from cvs2svn_lib import changeset_database +from cvs2svn_lib.common import CVSTextDecoder +from cvs2svn_lib.log import Log +from cvs2svn_lib.project import Project +from cvs2svn_lib.git_revision_recorder import GitRevisionRecorder +from cvs2svn_lib.git_output_option import GitRevisionMarkWriter +from cvs2svn_lib.git_output_option import GitOutputOption +from cvs2svn_lib.revision_manager import NullRevisionRecorder +from cvs2svn_lib.revision_manager import NullRevisionExcluder +from cvs2svn_lib.fulltext_revision_recorder \ + import SimpleFulltextRevisionRecorderAdapter +from cvs2svn_lib.rcs_revision_manager import RCSRevisionReader +from cvs2svn_lib.cvs_revision_manager import CVSRevisionReader +from cvs2svn_lib.checkout_internal import InternalRevisionRecorder +from cvs2svn_lib.checkout_internal import InternalRevisionExcluder +from cvs2svn_lib.checkout_internal import InternalRevisionReader +from cvs2svn_lib.symbol_strategy import AllBranchRule +from cvs2svn_lib.symbol_strategy import AllTagRule +from cvs2svn_lib.symbol_strategy import BranchIfCommitsRule +from 
cvs2svn_lib.symbol_strategy import ExcludeRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ForceBranchRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ForceTagRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ExcludeTrivialImportBranchRule +from cvs2svn_lib.symbol_strategy import ExcludeVendorBranchRule +from cvs2svn_lib.symbol_strategy import HeuristicStrategyRule +from cvs2svn_lib.symbol_strategy import UnambiguousUsageRule +from cvs2svn_lib.symbol_strategy import HeuristicPreferredParentRule +from cvs2svn_lib.symbol_strategy import SymbolHintsFileRule +from cvs2svn_lib.symbol_transform import ReplaceSubstringsSymbolTransform +from cvs2svn_lib.symbol_transform import RegexpSymbolTransform +from cvs2svn_lib.symbol_transform import IgnoreSymbolTransform +from cvs2svn_lib.symbol_transform import NormalizePathsSymbolTransform +from cvs2svn_lib.property_setters import AutoPropsPropertySetter +from cvs2svn_lib.property_setters import CVSBinaryFileDefaultMimeTypeSetter +from cvs2svn_lib.property_setters import CVSBinaryFileEOLStyleSetter +from cvs2svn_lib.property_setters import CVSRevisionNumberSetter +from cvs2svn_lib.property_setters import DefaultEOLStyleSetter +from cvs2svn_lib.property_setters import EOLStyleFromMimeTypeSetter +from cvs2svn_lib.property_setters import ExecutablePropertySetter +from cvs2svn_lib.property_setters import KeywordsPropertySetter +from cvs2svn_lib.property_setters import MimeMapper +from cvs2svn_lib.property_setters import SVNBinaryFileKeywordsPropertySetter + +# To choose the level of logging output, uncomment one of the +# following lines: +#Log().log_level = Log.WARN +#Log().log_level = Log.QUIET +#Log().log_level = Log.NORMAL +#Log().log_level = Log.VERBOSE +Log().log_level = Log.DEBUG + + +# During CollectRevsPass, cvs2git records the contents of file +# revisions into a "blob" file in git-fast-import format. 
This option +# configures that process: +ctx.revision_recorder = SimpleFulltextRevisionRecorderAdapter( + # The following option specifies how the revision contents of the RCS + # files should be read. + # + # RCSRevisionReader uses RCS's "co" program to extract the revision + # contents of the RCS files during CollectRevsPass. The constructor + # argument specifies how to invoke the "co" executable. + # + # CVSRevisionReader uses the "cvs" program to extract the revision + # contents out of the RCS files during OutputPass. This option is + # considerably slower than RCSRevisionReader because "cvs" is + # considerably slower than "co". However, it works in some situations + # where RCSRevisionReader fails; see the HTML documentation of the + # "--use-cvs" option for details. The constructor argument specifies + # how to invoke the "co" executable. + # + # Uncomment one of the two following lines: + RCSRevisionReader(co_executable=r'co'), + #CVSRevisionReader(cvs_executable=r'cvs'), + + # The file in which to write the git-fast-import stream that + # contains the file revision contents: + GitRevisionRecorder('cvs2svn-tmp/git-blob.dat'), + ) + +# cvs2git does not need to keep track of what revisions will be +# excluded, so leave this option unchanged: +ctx.revision_excluder = NullRevisionExcluder() + +# cvs2git doesn't need a revision reader because OutputPass only +# refers to blobs that were output during CollectRevsPass, so leave +# this option set to None. +ctx.revision_reader = None + +# Set the name (and optionally the path) of some other executables +# required by cvs2svn: +ctx.sort_executable = r'sort' + +# Change the following line to True if the conversion should only +# include the trunk of the repository (i.e., all branches and tags +# should be omitted from the conversion): +ctx.trunk_only = False + +# How to convert CVS author names, log messages, and filenames to +# Unicode. 
The first argument to CVSTextDecoder is a list of encoders +# that are tried in order in 'strict' mode until one of them succeeds. +# If none of those succeeds, then fallback_encoder (if it is +# specified) is used in lossy 'replace' mode. Setting a fallback +# encoder ensures that the encoder always succeeds, but it can cause +# information loss. +ctx.cvs_author_decoder = CVSTextDecoder( + [ + #'latin1', + #'utf8', + 'ascii', + ], + fallback_encoding='latin1' + ) +ctx.cvs_log_decoder = CVSTextDecoder( + [ + #'latin1', + #'utf8', + 'ascii', + ], + fallback_encoding='latin1' + ) +# You might want to be especially strict when converting filenames to +# Unicode (e.g., maybe not specify a fallback_encoding). +ctx.cvs_filename_decoder = CVSTextDecoder( + [ + #'latin1', + #'utf8', + 'ascii', + ], + #fallback_encoding='ascii' + ) + +# Template for the commit message to be used for initial project +# commits. +ctx.initial_project_commit_message = ( + 'Standard project directories initialized by cvs2svn.' + ) + +# Template for the commit message to be used for post commits, in +# which modifications to a vendor branch are copied back to trunk. +# This message can use '%(revnum)d' to include the SVN revision number +# of the revision that included the change to the vendor branch +# (admittedly rather pointless in a cvs2git conversion). +ctx.post_commit_message = ( + 'This commit was generated by cvs2svn to track changes on a CVS ' + 'vendor branch.' + ) + +# Template for the commit message to be used for commits in which +# symbols are created. This message can use '%(symbol_type)d' to +# include the type of the symbol ('branch' or 'tag') or +# '%(symbol_name)' to include the name of the symbol. +ctx.symbol_commit_message = ( + "This commit was manufactured by cvs2svn to create %(symbol_type)s " + "'%(symbol_name)s'." 
+ ) + +# Some CVS clients for MacOS store resource fork data into CVS along +# with the file contents itself by wrapping it all up in a container +# format called "AppleSingle". Subversion currently does not support +# MacOS resource forks. Nevertheless, sometimes the resource fork +# information is not necessary and can be discarded. Set the +# following option to True if you would like cvs2svn to identify files +# whose contents are encoded in AppleSingle format, and discard all +# but the data fork for such files before committing them to +# Subversion. (Please note that AppleSingle contents are identified +# by the AppleSingle magic number as the first four bytes of the file. +# This check is not failproof, so only set this option if you think +# you need it.) +ctx.decode_apple_single = False + +# This option can be set to the name of a filename to which are stored +# statistics and conversion decisions about the CVS symbols. +ctx.symbol_info_filename = None +#ctx.symbol_info_filename = 'symbol-info.txt' + +# cvs2svn uses "symbol strategy rules" to help decide how to handle +# CVS symbols. The rules in a project's symbol_strategy_rules are +# applied in order, and each rule is allowed to modify the symbol. +# The result (after each of the rules has been applied) is used for +# the conversion. +# +# 1. A CVS symbol might be used as a tag in one file and as a branch +# in another file. cvs2svn has to decide whether to convert such a +# symbol as a tag or as a branch. cvs2svn uses a series of +# heuristic rules to decide how to convert a symbol. The user can +# override the default rules for specific symbols or symbols +# matching regular expressions. +# +# 2. cvs2svn is also capable of excluding symbols from the conversion +# (provided no other symbols depend on them. +# +# 3. CVS does not record unambiguously the line of development from +# which a symbol sprouted. cvs2svn uses a heuristic to choose a +# symbol's "preferred parents". 
+# +# The standard branch/tag/exclude StrategyRules do not change a symbol +# that has already been processed by an earlier rule, so in effect the +# first matching rule is the one that is used. + +global_symbol_strategy_rules = [ + # It is possible to specify manually exactly how symbols should be + # converted and what line of development should be used as the + # preferred parent. To do so, create a file containing the symbol + # hints and enable the following option. + # + # The format of the hints file is described in the documentation + # for the --symbol-hints command-line option. The file output by + # the --write-symbol-info (i.e., ctx.symbol_info_filename) option + # is in the same format. The simplest way to use this option is + # to run the conversion through CollateSymbolsPass with + # --write-symbol-info option, copy the symbol info and edit it to + # create a hints file, then re-start the conversion at + # CollateSymbolsPass with this option enabled. + #SymbolHintsFileRule('symbol-hints.txt'), + + # To force all symbols matching a regular expression to be + # converted as branches, add rules like the following: + #ForceBranchRegexpStrategyRule(r'branch.*'), + + # To force all symbols matching a regular expression to be + # converted as tags, add rules like the following: + #ForceTagRegexpStrategyRule(r'tag.*'), + + # To force all symbols matching a regular expression to be + # excluded from the conversion, add rules like the following: + #ExcludeRegexpStrategyRule(r'unknown-.*'), + + # Sometimes people use "cvs import" to get their own source code + # into CVS. This practice creates a vendor branch 1.1.1 and + # imports the code onto the vendor branch as 1.1.1.1, then copies + # the same content to the trunk as version 1.1. Normally, such + # vendor branches are useless and they complicate the SVN history + # unnecessarily. 
The following rule excludes any branches that + # only existed as a vendor branch with a single import (leaving + # only the 1.1 revision). If you want to retain such branches, + # comment out the following line. (Please note that this rule + # does not exclude vendor *tags*, as they are not so easy to + # identify.) + ExcludeTrivialImportBranchRule(), + + # To exclude all vendor branches (branches that had "cvs import"s + # on them bug no other kinds of commits), uncomment the following + # line: + #ExcludeVendorBranchRule(), + + # Usually you want this rule, to convert unambiguous symbols + # (symbols that were only ever used as tags or only ever used as + # branches in CVS) the same way they were used in CVS: + UnambiguousUsageRule(), + + # If there was ever a commit on a symbol, then it cannot be + # converted as a tag. This rule causes all such symbols to be + # converted as branches. If you would like to resolve such + # ambiguities manually, comment out the following line: + BranchIfCommitsRule(), + + # Last in the list can be a catch-all rule that is used for + # symbols that were not matched by any of the more specific rules + # above. (Assuming that BranchIfCommitsRule() was included above, + # then the symbols that are still indeterminate at this point can + # sensibly be converted as branches or tags.) Include at most one + # of these lines. If none of these catch-all rules are included, + # then the presence of any ambiguous symbols (that haven't been + # disambiguated above) is an error: + + # Convert ambiguous symbols based on whether they were used more + # often as branches or as tags: + HeuristicStrategyRule(), + # Convert all ambiguous symbols as branches: + #AllBranchRule(), + # Convert all ambiguous symbols as tags: + #AllTagRule(), + + # The last rule is here to choose the preferred parent of branches + # and tags, that is, the line of development from which the symbol + # sprouts. 
+ HeuristicPreferredParentRule(), + ] + +# Specify a username to be used for commits for which CVS doesn't +# record the original author (for example, the creation of a branch). +# This should be a simple (unix-style) username, but it can be +# translated into a git-style name by the author_transforms map. +ctx.username = 'cvs2svn' + +# ctx.svn_property_setters contains a list of rules used to set the +# svn properties on files in the converted archive. For each file, +# the rules are tried one by one. Any rule can add or suppress one or +# more svn properties. Typically the rules will not overwrite +# properties set by a previous rule (though they are free to do so). +# +# Obviously, SVN properties per se are not interesting for a cvs2git +# conversion, but some of these properties have side-effects that do +# affect the git output. FIXME: Document this in more detail. +ctx.svn_property_setters.extend([ + # To read auto-props rules from a file, uncomment the following line + # and specify a filename. The boolean argument specifies whether + # case should be ignored when matching filenames to the filename + # patterns found in the auto-props file: + #AutoPropsPropertySetter( + # r'/home/username/.subversion/config', + # ignore_case=True, + # ), + + # To read mime types from a file, uncomment the following line and + # specify a filename: + #MimeMapper(r'/etc/mime.types'), + + # Omit the svn:eol-style property from any files that are listed + # as binary (i.e., mode '-kb') in CVS: + CVSBinaryFileEOLStyleSetter(), + + # If the file is binary and its svn:mime-type property is not yet + # set, set svn:mime-type to 'application/octet-stream'. + CVSBinaryFileDefaultMimeTypeSetter(), + + # To try to determine the eol-style from the mime type, uncomment + # the following line: + #EOLStyleFromMimeTypeSetter(), + + # Choose one of the following lines to set the default + # svn:eol-style if none of the above rules applied. 
The argument + # is the svn:eol-style that should be applied, or None if no + # svn:eol-style should be set (i.e., the file should be treated as + # binary). + # + # The default is to treat all files as binary unless one of the + # previous rules has determined otherwise, because this is the + # safest approach. However, if you have been diligent about + # marking binary files with -kb in CVS and/or you have used the + # above rules to definitely mark binary files as binary, then you + # might prefer to use 'native' as the default, as it is usually + # the most convenient setting for text files. Other possible + # options: 'CRLF', 'CR', 'LF'. + DefaultEOLStyleSetter(None), + #DefaultEOLStyleSetter('native'), + + # Prevent svn:keywords from being set on files that have + # svn:eol-style unset. + SVNBinaryFileKeywordsPropertySetter(), + + # If svn:keywords has not been set yet, set it based on the file's + # CVS mode: + KeywordsPropertySetter(config.SVN_KEYWORDS_VALUE), + + # Set the svn:executable flag on any files that are marked in CVS as + # being executable: + ExecutablePropertySetter(), + + ]) + +# The directory to use for temporary files: +ctx.tmpdir = r'cvs2svn-tmp' + +# To skip the cleanup of temporary files, uncomment the following +# option: +#ctx.skip_cleanup = True + + +# In CVS, it is perfectly possible to make a single commit that +# affects more than one project or more than one branch of a single +# project. Subversion also allows such commits. Therefore, by +# default, when cvs2svn sees what looks like a cross-project or +# cross-branch CVS commit, it converts it into a +# cross-project/cross-branch Subversion commit. +# +# However, other tools and SCMs have trouble representing +# cross-project or cross-branch commits. (For example, Trac's Revtree +# plugin, http://www.trac-hacks.org/wiki/RevtreePlugin is confused by +# such commits.) Therefore, we provide the following two options to +# allow cross-project/cross-branch commits to be suppressed. 
+ +# cvs2git only supports single-project conversions (multiple-project +# conversions wouldn't really make sense for git anyway). So this +# option must be set to False: +ctx.cross_project_commits = False + +# git itself doesn't allow commits that affect more than one branch, +# so this option must be set to False: +ctx.cross_branch_commits = False + +# cvs2git does not yet handle translating .cvsignore files into +# .gitignore files, so by default, the .cvsignore files are included +# in the conversion output. If you would like to omit the .cvsignore +# files from the output, set this option to False: +ctx.keep_cvsignore = True + +# By default, it is a fatal error for a CVS ",v" file to appear both +# inside and outside of an "Attic" subdirectory (this should never +# happen, but frequently occurs due to botched repository +# administration). If you would like to retain both versions of such +# files, change the following option to True, and the attic version of +# the file will be written to a subdirectory called "Attic" in the +# output repository: +ctx.retain_conflicting_attic_files = False + +# CVS uses unix login names as author names whereas git requires +# author names to be of the form "foo ". The default is to set +# the git author to "cvsauthor ". author_transforms can be +# used to map cvsauthor names (e.g., "jrandom") to a true name and +# email address (e.g., "J. Random " for the +# example shown). All values should be either Unicode strings (i.e., +# with "u" as a prefix) or 8-bit strings in the utf-8 encoding. +# Please substitute your own project's usernames here to use with the +# author_transforms option of GitOutputOption below. +author_transforms={ + 'jrandom' : ('J. 
Random', 'jrandom@example.com'), + 'mhagger' : ('Michael Haggerty', 'mhagger@alum.mit.edu'), + 'brane' : (u'Branko Čibej', 'brane@xbc.nu'), + 'ringstrom' : ('Tobias Ringström', 'tobias@ringstrom.mine.nu'), + 'dionisos' : (u'Erik Hülsmann', 'e.huelsmann@gmx.net'), + + # This one will be used for commits for which CVS doesn't record + # the original author, as explained above. + 'cvs2svn' : ('cvs2svn', 'admin@example.com'), + } + +# This is the main option that causes cvs2svn to output to a +# "fastimport"-format dumpfile rather than to Subversion: +ctx.output_option = GitOutputOption( + # The file in which to write the git-fast-import stream that + # contains the changesets and branch/tag information: + 'cvs2svn-tmp/git-dump.dat', + + # The blobs will be written via the revision recorder, so in + # OutputPass we only have to emit references to the blob marks: + GitRevisionMarkWriter(), + + # This option can be set to an integer to limit the number of + # revisions that are merged with the main parent in any commit. + # For git output, this can be set to None (unlimited), though due + # to the limitations of other tools you might want to set it to a + # smaller number (e.g., 16). For Mercurial output, this should be + # set to 1. + max_merges=None, + #max_merges=1, + + # Optional map from CVS author names to git author names: + author_transforms=author_transforms, + ) + +# Change this option to True to turn on profiling of cvs2svn (for +# debugging purposes): +run_options.profiling = False + + +# Should CVSItem -> Changeset database files be memory mapped? In +# some tests, using memory mapping speeded up the overall conversion +# by about 5%. But this option can cause the conversion to fail with +# an out of memory error if the conversion computer runs out of +# virtual address space (e.g., when running a very large conversion on +# a 32-bit operating system). Therefore it is disabled by default. 
+# Uncomment the following line to allow these database files to be +# memory mapped. +changeset_database.use_mmap_for_cvs_item_to_changeset_table = True + +# Now set the project to be converted to git. cvs2git only supports +# single-project conversions, so this method must only be called +# once: +run_options.set_project( + # The filesystem path to the part of the CVS repository (*not* a + # CVS working copy) that should be converted. This may be a + # subdirectory (i.e., a module) within a larger CVS repository. + r'cvs-repo', + + # A list of symbol transformations that can be used to rename + # symbols in this project. + symbol_transforms=[ + # Use IgnoreSymbolTransforms like the following to completely + # ignore symbols matching a regular expression when parsing + # the CVS repository, for example to avoid warnings about + # branches with two names and to choose the preferred name. + # It is *not* recommended to use this instead of + # ExcludeRegexpStrategyRule; though more efficient, + # IgnoreSymbolTransforms are less flexible and don't exclude + # branches correctly. The argument is a Python-style regular + # expression that has to match the *whole* CVS symbol name: + #IgnoreSymbolTransform(r'nightly-build-tag-.*') + + # RegexpSymbolTransforms transform symbols textually using a + # regular expression. The first argument is a Python regular + # expression pattern and the second is a replacement pattern. + # The pattern is matched against each symbol name. If it + # matches the whole symbol name, then the symbol name is + # replaced with the corresponding replacement text. The + # replacement can include substitution patterns (e.g., r'\1' + # or r'\g'). Typically you will want to use raw strings + # (strings with a preceding 'r', like shown in the examples) + # for the regexp and its replacement to avoid backslash + # substitution within those strings. 
+ #RegexpSymbolTransform(r'release-(\d+)_(\d+)', + # r'release-\1.\2'), + #RegexpSymbolTransform(r'release-(\d+)_(\d+)_(\d+)', + # r'release-\1.\2.\3'), + + # Simple 1:1 character replacements can also be done. The + # following transform, which converts backslashes into forward + # slashes, should usually be included: + ReplaceSubstringsSymbolTransform('\\','/'), + + # This last rule eliminates leading, trailing, and repeated + # slashes within the output symbol names: + NormalizePathsSymbolTransform(), + ], + + # See the definition of global_symbol_strategy_rules above for a + # description of this option: + symbol_strategy_rules=global_symbol_strategy_rules, + ) + diff --git a/cvs2svn_lib/__init__.py b/cvs2svn_lib/__init__.py new file mode 100644 index 0000000..838d4c6 --- /dev/null +++ b/cvs2svn_lib/__init__.py @@ -0,0 +1,18 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This package contains modules that support cvs2svn.""" + diff --git a/cvs2svn_lib/apple_single_filter.py b/cvs2svn_lib/apple_single_filter.py new file mode 100644 index 0000000..95fa9cb --- /dev/null +++ b/cvs2svn_lib/apple_single_filter.py @@ -0,0 +1,292 @@ +# (Be in -*- python -*- mode.) 
+# +# ==================================================================== +# Copyright (c) 2007-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""A stream filter for extracting the data fork from AppleSingle data. + +Some Macintosh CVS clients store resource fork data along with the +contents of the file (called the data fork) by encoding both in an +'AppleSingle' data stream before storing them to CVS. This file +contains a stream filter for extracting the data fork from such data +streams. (Any other forks are discarded.) 
+ +See the following for some random information about this format and +how it is used by Macintosh CVS clients: + + http://users.phg-online.de/tk/netatalk/doc/Apple/v1/ + http://rfc.net/rfc1740.html + http://ximbiot.com/cvs/cvshome/cyclic/cvs/dev-mac.html + http://www.maccvs.org/faq.html#resfiles + http://www.heilancoo.net/MacCVSClient/MacCVSClientDoc/storage-formats.html + +""" + + +import struct +from cStringIO import StringIO + + +class AppleSingleFormatError(IOError): + """The stream was not in correct AppleSingle format.""" + + pass + + +class AppleSingleIncorrectMagicError(AppleSingleFormatError): + """The file didn't start with the correct magic number.""" + + def __init__(self, data_read, eof): + AppleSingleFormatError.__init__(self) + self.data_read = data_read + self.eof = eof + + +class AppleSingleEOFError(AppleSingleFormatError): + """EOF was reached where AppleSingle doesn't allow it.""" + + pass + + +class AppleSingleFilter(object): + """A stream that reads the data fork from an AppleSingle stream. + + If the constructor discovers that the file is not a legitimate + AppleSingle stream, then it raises an AppleSingleFormatError. In + the special case that the magic number is incorrect, it raises + AppleSingleIncorrectMagicError with data_read set to the data that + have been read so far from the input stream. 
(This allows the + caller the option to fallback to treating the input stream as a + normal binary data stream.)""" + + # The header is: + # + # Magic number 4 bytes + # Version number 4 bytes + # File system or filler 16 bytes + # Number of entries 2 bytes + magic_struct = '>i' + magic_len = struct.calcsize(magic_struct) + + # The part of the header after the magic number: + rest_of_header_struct = '>i16sH' + rest_of_header_len = struct.calcsize(rest_of_header_struct) + + # Each entry is: + # + # Entry ID 4 bytes + # Offset 4 bytes + # Length 4 bytes + entry_struct = '>iii' + entry_len = struct.calcsize(entry_struct) + + apple_single_magic = 0x00051600 + apple_single_version_1 = 0x00010000 + apple_single_version_2 = 0x00020000 + apple_single_filler = '\0' * 16 + + apple_single_data_fork_entry_id = 1 + + def __init__(self, stream): + self.stream = stream + + # Check for the AppleSingle magic number: + s = self._read_exactly(self.magic_len) + if len(s) < self.magic_len: + raise AppleSingleIncorrectMagicError(s, True) + + (magic,) = struct.unpack(self.magic_struct, s) + if magic != self.apple_single_magic: + raise AppleSingleIncorrectMagicError(s, False) + + # Read the rest of the header: + s = self._read_exactly(self.rest_of_header_len) + if len(s) < self.rest_of_header_len: + raise AppleSingleEOFError('AppleSingle header incomplete') + + (version, filler, num_entries) = \ + struct.unpack(self.rest_of_header_struct, s) + + if version == self.apple_single_version_1: + self._prepare_apple_single_v1_file(num_entries) + elif version == self.apple_single_version_2: + if filler != self.apple_single_filler: + raise AppleSingleFormatError('Incorrect filler') + self._prepare_apple_single_v2_file(num_entries) + else: + raise AppleSingleFormatError('Unknown AppleSingle version') + + def _read_exactly(self, size): + """Read and return exactly SIZE characters from the stream. 
+ + This method is to deal with the fact that stream.read(size) is + allowed to return less than size characters. If EOF is reached + before SIZE characters have been read, return the characters that + have been read so far.""" + + retval = [] + length_remaining = size + while length_remaining > 0: + s = self.stream.read(length_remaining) + if not s: + break + retval.append(s) + length_remaining -= len(s) + + return ''.join(retval) + + def _prepare_apple_single_file(self, num_entries): + entries = self._read_exactly(num_entries * self.entry_len) + if len(entries) < num_entries * self.entry_len: + raise AppleSingleEOFError('Incomplete entries list') + + for i in range(num_entries): + entry = entries[i * self.entry_len : (i + 1) * self.entry_len] + (entry_id, offset, length) = struct.unpack(self.entry_struct, entry) + if entry_id == self.apple_single_data_fork_entry_id: + break + else: + raise AppleSingleFormatError('No data fork found') + + # The data fork is located at [offset : offset + length]. 
Read up + # to the start of the data: + n = offset - self.magic_len - self.rest_of_header_len - len(entries) + if n < 0: + raise AppleSingleFormatError('Invalid offset to AppleSingle data fork') + + max_chunk_size = 65536 + while n > 0: + s = self.stream.read(min(n, max_chunk_size)) + if not s: + raise AppleSingleEOFError( + 'Offset to AppleSingle data fork past end of file' + ) + n -= len(s) + + self.length_remaining = length + + def _prepare_apple_single_v1_file(self, num_entries): + self._prepare_apple_single_file(num_entries) + + def _prepare_apple_single_v2_file(self, num_entries): + self._prepare_apple_single_file(num_entries) + + def read(self, size=-1): + if size == 0 or self.length_remaining == 0: + return '' + elif size < 0: + s = self._read_exactly(self.length_remaining) + if len(s) < self.length_remaining: + raise AppleSingleEOFError('AppleSingle data fork truncated') + self.length_remaining = 0 + return s + else: + # The length of this read is allowed to be shorter than the + # requested size: + s = self.stream.read(min(size, self.length_remaining)) + if not s: + raise AppleSingleEOFError() + self.length_remaining -= len(s) + return s + + def close(self): + self.stream.close() + self.stream = None + + +class CompoundStream(object): + """A stream that reads from a series of streams, one after the other.""" + + def __init__(self, *streams): + self.streams = list(streams) + self.stream_index = 0 + + def read(self, size=-1): + if size < 0: + retval = [] + while self.stream_index < len(self.streams): + retval.append(self.streams[self.stream_index].read()) + self.stream_index += 1 + return ''.join(retval) + else: + while self.stream_index < len(self.streams): + s = self.streams[self.stream_index].read(size) + if s: + # This may not be the full size requested, but that is OK: + return s + else: + # That stream was empty; proceed to the next stream: + self.stream_index += 1 + + # No streams are left: + return '' + + def close(self): + for stream in 
self.streams: + stream.close() + self.streams = None + + +def get_maybe_apple_single_stream(stream): + """Treat STREAM as AppleSingle if possible; otherwise treat it literally. + + If STREAM is in AppleSingle format, then return a stream that will + output the data fork of the original stream. Otherwise, return a + stream that will output the original file contents literally. + + Be careful not to read from STREAM after it has already hit EOF.""" + + try: + return AppleSingleFilter(stream) + except AppleSingleIncorrectMagicError, e: + # This is OK; the file is not AppleSingle, so we read it normally: + string_io = StringIO(e.data_read) + if e.eof: + # The original stream already reached EOF, so the part already + # read contains the complete file contents: + return string_io + else: + # The stream needs to output the part already read followed by + # whatever hasn't been read of the original stream: + return CompoundStream(string_io, stream) + + +if __name__ == '__main__': + # For fun and testing, allow use of this file as a pipe if it is + # invoked as a script. Specifically, if stdin is in AppleSingle + # format, then output only its data fork; otherwise, output it + # unchanged. + # + # This might not work on systems where sys.stdin is opened in text + # mode. + # + # Remember to set PYTHONPATH to point to the main cvs2svn directory. + + import sys + + #CHUNK_SIZE = -1 + CHUNK_SIZE = 100 + + f = get_maybe_apple_single_stream(sys.stdin) + + if CHUNK_SIZE < 0: + sys.stdout.write(f.read()) + else: + while True: + s = f.read(CHUNK_SIZE) + if not s: + break + sys.stdout.write(s) + + diff --git a/cvs2svn_lib/artifact.py b/cvs2svn_lib/artifact.py new file mode 100644 index 0000000..99d6945 --- /dev/null +++ b/cvs2svn_lib/artifact.py @@ -0,0 +1,59 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. 
+# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module defines Artifact types to be used with an ArtifactManager.""" + + +import os + +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.log import Log + + +class Artifact(object): + """An object that is created, used across passes, then cleaned up.""" + + def __init__(self): + # The set of passes that need this artifact. This field is + # maintained by ArtifactManager. + self._passes_needed = set() + + def cleanup(self): + """This artifact is no longer needed; clean it up.""" + + pass + + +class TempFile(Artifact): + """A temporary file that can be used across cvs2svn passes.""" + + def __init__(self, basename): + Artifact.__init__(self) + self.basename = basename + + def _get_filename(self): + return Ctx().get_temp_filename(self.basename) + + filename = property(_get_filename) + + def cleanup(self): + Log().verbose("Deleting", self.filename) + os.unlink(self.filename) + + def __str__(self): + return 'Temporary file %r' % (self.filename,) + + diff --git a/cvs2svn_lib/artifact_manager.py b/cvs2svn_lib/artifact_manager.py new file mode 100644 index 0000000..08f0ec7 --- /dev/null +++ b/cvs2svn_lib/artifact_manager.py @@ -0,0 +1,256 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. 
+# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module manages the artifacts produced by conversion passes.""" + + +from cvs2svn_lib.log import Log +from cvs2svn_lib.artifact import TempFile + + +class ArtifactNotActiveError(Exception): + """An artifact was requested when no passes that have registered + that they need it are active.""" + + def __init__(self, artifact_name): + Exception.__init__( + self, 'Artifact %s is not currently active' % artifact_name) + + +class ArtifactManager: + """Manage artifacts that are created by one pass but needed by others. + + This class is responsible for cleaning up artifacts once they are no + longer needed. The trick is that cvs2svn can be run pass by pass, + so not all passes might be executed during a specific program run. + + To use this class: + + - Call artifact_manager.set_artifact(name, artifact) once for each + known artifact. + + - Call artifact_manager.creates(which_pass, artifact) to indicate + that WHICH_PASS is the pass that creates ARTIFACT. + + - Call artifact_manager.uses(which_pass, artifact) to indicate that + WHICH_PASS needs to use ARTIFACT. + + There are also helper methods register_temp_file(), + register_artifact_needed(), and register_temp_file_needed() which + combine some useful operations. + + Then, in pass order: + + - Call pass_skipped() for any passes that were already executed + during a previous cvs2svn run. 
+ + - Call pass_started() when a pass is about to start execution. + + - If a pass that has been started will be continued during the next + program run, then call pass_continued(). + + - If a pass that has been started finishes execution, call + pass_done(), to allow any artifacts that won't be needed anymore + to be cleaned up. + + - Call pass_deferred() for any passes that have been deferred to a + future cvs2svn run. + + Finally: + + - Call check_clean() to verify that all artifacts have been + accounted for.""" + + def __init__(self): + # A map { artifact_name : artifact } of known artifacts. + self._artifacts = { } + + # A map { pass : set_of_artifacts }, where set_of_artifacts is a + # set of artifacts needed by the pass. + self._pass_needs = { } + + # A set of passes that are currently being executed. + self._active_passes = set() + + def set_artifact(self, name, artifact): + """Add ARTIFACT to the list of artifacts that we manage. + + Store it under NAME.""" + + assert name not in self._artifacts + self._artifacts[name] = artifact + + def get_artifact(self, name): + """Return the artifact with the specified name. + + If the artifact does not currently exist, raise a KeyError. If it + is not registered as being needed by one of the active passes, + raise an ArtifactNotActiveError.""" + + artifact = self._artifacts[name] + for active_pass in self._active_passes: + if artifact in self._pass_needs[active_pass]: + # OK + return artifact + else: + raise ArtifactNotActiveError(name) + + def creates(self, which_pass, artifact): + """Register that WHICH_PASS creates ARTIFACT. + + ARTIFACT must already have been registered.""" + + # An artifact is automatically "needed" in the pass in which it is + # created: + self.uses(which_pass, artifact) + + def uses(self, which_pass, artifact): + """Register that WHICH_PASS uses ARTIFACT. 
+ + ARTIFACT must already have been registered.""" + + artifact._passes_needed.add(which_pass) + if which_pass in self._pass_needs: + self._pass_needs[which_pass].add(artifact) + else: + self._pass_needs[which_pass] = set([artifact]) + + def register_temp_file(self, basename, which_pass): + """Register a temporary file with base name BASENAME as an artifact. + + Return the filename of the temporary file.""" + + artifact = TempFile(basename) + self.set_artifact(basename, artifact) + self.creates(which_pass, artifact) + + def get_temp_file(self, basename): + """Return the filename of the temporary file with the specified BASENAME. + + If the temporary file is not an existing, registered TempFile, + raise a KeyError.""" + + return self.get_artifact(basename).filename + + def register_artifact_needed(self, artifact_name, which_pass): + """Register that WHICH_PASS uses the artifact named ARTIFACT_NAME. + + An artifact with this name must already have been registered.""" + + artifact = self._artifacts[artifact_name] + artifact._passes_needed.add(which_pass) + if which_pass in self._pass_needs: + self._pass_needs[which_pass].add(artifact) + else: + self._pass_needs[which_pass] = set([artifact,]) + + def register_temp_file_needed(self, basename, which_pass): + """Register that a temporary file is needed by WHICH_PASS. + + Register that the temporary file with base name BASENAME is needed + by WHICH_PASS.""" + + self.register_artifact_needed(basename, which_pass) + + def _unregister_artifacts(self, which_pass): + """Unregister any artifacts that were needed for WHICH_PASS. 
+ + Return a list of artifacts that are no longer needed at all.""" + + try: + artifacts = list(self._pass_needs[which_pass]) + except KeyError: + # No artifacts were needed for that pass: + return [] + + del self._pass_needs[which_pass] + + unneeded_artifacts = [] + for artifact in artifacts: + artifact._passes_needed.remove(which_pass) + if not artifact._passes_needed: + unneeded_artifacts.append(artifact) + + return unneeded_artifacts + + def pass_skipped(self, which_pass): + """WHICH_PASS was executed during a previous cvs2svn run. + + Its artifacts were created then, and any artifacts that would + normally be cleaned up after this pass have already been cleaned + up.""" + + self._unregister_artifacts(which_pass) + + def pass_started(self, which_pass): + """WHICH_PASS is starting.""" + + self._active_passes.add(which_pass) + + def pass_continued(self, which_pass): + """WHICH_PASS will be continued during the next program run. + + WHICH_PASS, which has already been started, will be continued + during the next program run. Unregister any artifacts that would + be cleaned up at the end of WHICH_PASS without actually cleaning + them up.""" + + self._active_passes.remove(which_pass) + self._unregister_artifacts(which_pass) + + def pass_done(self, which_pass, skip_cleanup): + """WHICH_PASS is done. + + Clean up all artifacts that are no longer needed. If SKIP_CLEANUP + is True, then just do the bookkeeping without actually calling + artifact.cleanup().""" + + self._active_passes.remove(which_pass) + artifacts = self._unregister_artifacts(which_pass) + if not skip_cleanup: + for artifact in artifacts: + artifact.cleanup() + + def pass_deferred(self, which_pass): + """WHICH_PASS is being deferred until a future cvs2svn run. + + Unregister any artifacts that would be cleaned up during + WHICH_PASS.""" + + self._unregister_artifacts(which_pass) + + def check_clean(self): + """All passes have been processed. 
+ + Output a warning messages if all artifacts have not been accounted + for. (This is mainly a consistency check, that no artifacts were + registered under nonexistent passes.)""" + + unclean_artifacts = [ + str(artifact) + for artifact in self._artifacts.values() + if artifact._passes_needed] + + if unclean_artifacts: + Log().warn( + 'INTERNAL: The following artifacts were not cleaned up:\n %s\n' + % ('\n '.join(unclean_artifacts))) + + +# The default ArtifactManager instance: +artifact_manager = ArtifactManager() + + diff --git a/cvs2svn_lib/bzr_run_options.py b/cvs2svn_lib/bzr_run_options.py new file mode 100644 index 0000000..5332dff --- /dev/null +++ b/cvs2svn_lib/bzr_run_options.py @@ -0,0 +1,175 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""This module manages cvs2bzr run options.""" + + +import sys +import datetime +import codecs + +from cvs2svn_lib.version import VERSION +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.run_options import not_both +from cvs2svn_lib.run_options import RunOptions +from cvs2svn_lib.run_options import ContextOption +from cvs2svn_lib.run_options import IncompatibleOption +from cvs2svn_lib.run_options import authors +from cvs2svn_lib.man_writer import ManWriter +from cvs2svn_lib.rcs_revision_manager import RCSRevisionReader +from cvs2svn_lib.cvs_revision_manager import CVSRevisionReader +from cvs2svn_lib.git_run_options import GitRunOptions +from cvs2svn_lib.git_output_option import GitRevisionInlineWriter +from cvs2svn_lib.git_output_option import GitOutputOption +from cvs2svn_lib.revision_manager import NullRevisionRecorder +from cvs2svn_lib.revision_manager import NullRevisionExcluder + + +short_desc = 'convert a cvs repository into a Bazaar repository' + +synopsis = """\ +.B cvs2bzr +[\\fIOPTION\\fR]... \\fIOUTPUT-OPTIONS CVS-REPOS-PATH\\fR +.br +.B cvs2bzr +[\\fIOPTION\\fR]... \\fI--options=PATH\\fR +""" + +description="""\ +Convert a CVS repository into a Bazaar repository, including history. + +""" +long_desc = """\ +Create a new Bazaar repository based on the version history stored in a +CVS repository. Each CVS commit will be mirrored in the Bazaar +repository, including such information as date of commit and id of the +committer. +.P +The output of this program is a "fast-import dumpfile", which +can be loaded into a Bazaar repository using the Bazaar FastImport +Plugin, available from https://launchpad.net/bzr-fastimport. + +.P +\\fICVS-REPOS-PATH\\fR is the filesystem path of the part of the CVS +repository that you want to convert. 
This path doesn't have to be the +top level directory of a CVS repository; it can point at a project +within a repository, in which case only that project will be +converted. This path or one of its parent directories has to contain +a subdirectory called CVSROOT (though the CVSROOT directory can be +empty). +.P +It is not possible directly to convert a CVS repository to which you +only have remote access, but the FAQ describes tools that may be used +to create a local copy of a remote CVS repository. +""" + +files = """\ +A directory called \\fIcvs2svn-tmp\\fR (or the directory specified by +\\fB--tmpdir\\fR) is used as scratch space for temporary data files. +""" + +see_also = [ + ('cvs', '1'), + ('bzr', '1'), + ] + + +class BzrRunOptions(GitRunOptions): + + def get_description(self): + return description + + def _get_output_options_group(self): + group = RunOptions._get_output_options_group(self) + + group.add_option(IncompatibleOption( + '--dumpfile', type='string', + action='store', + help='path to which the data should be written', + man_help=( + 'Write the blobs and revision data to \\fIpath\\fR.' + ), + metavar='PATH', + )) + group.add_option(ContextOption( + '--dry-run', + action='store_true', + help=( + 'do not create any output; just print what would happen.' + ), + man_help=( + 'Do not create any output; just print what would happen.' + ), + )) + + return group + + def callback_manpage(self, option, opt_str, value, parser): + f = codecs.getwriter('utf_8')(sys.stdout) + ManWriter( + parser, + section='1', + date=datetime.date.today(), + source='Version %s' % (VERSION,), + manual='User Commands', + short_desc=short_desc, + synopsis=synopsis, + long_desc=long_desc, + files=files, + authors=authors, + see_also=see_also, + ).write_manpage(f) + sys.exit(0) + + def process_io_options(self): + """Process input/output options. 
+ + Process options related to extracting data from the CVS repository + and writing to a Bazaar-friendly fast-import file.""" + + ctx = Ctx() + options = self.options + + not_both(options.use_rcs, '--use-rcs', + options.use_cvs, '--use-cvs') + + if options.use_rcs: + revision_reader = RCSRevisionReader( + co_executable=options.co_executable + ) + else: + # --use-cvs is the default: + revision_reader = CVSRevisionReader( + cvs_executable=options.cvs_executable + ) + + if not ctx.dry_run and not options.dumpfile: + raise FatalError("must pass '--dry-run' or '--dumpfile' option.") + + ctx.revision_recorder = NullRevisionRecorder() + ctx.revision_excluder = NullRevisionExcluder() + ctx.revision_reader = None + + ctx.output_option = GitOutputOption( + options.dumpfile, + GitRevisionInlineWriter(revision_reader), + max_merges=None, + # Optional map from CVS author names to bzr author names: + author_transforms={}, # FIXME + ) + + diff --git a/cvs2svn_lib/changeset.py b/cvs2svn_lib/changeset.py new file mode 100644 index 0000000..1022e0a --- /dev/null +++ b/cvs2svn_lib/changeset.py @@ -0,0 +1,269 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2006-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""Manage change sets.""" + + +from cvs2svn_lib.common import InternalError +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.symbol import Branch +from cvs2svn_lib.symbol import Tag +from cvs2svn_lib.time_range import TimeRange +from cvs2svn_lib.changeset_graph_node import ChangesetGraphNode + + +class Changeset(object): + """A set of cvs_items that might potentially form a single change set.""" + + def __init__(self, id, cvs_item_ids): + self.id = id + self.cvs_item_ids = list(cvs_item_ids) + + def iter_cvs_items(self): + """Yield the CVSItems within this Changeset.""" + + for (id, cvs_item) in Ctx()._cvs_items_db.get_many(self.cvs_item_ids): + assert cvs_item is not None + yield cvs_item + + def get_projects_opened(self): + """Return the set of projects that might be opened by this changeset.""" + + raise NotImplementedError() + + def create_graph_node(self, cvs_item_to_changeset_id): + """Return a ChangesetGraphNode for this Changeset.""" + + raise NotImplementedError() + + def create_split_changeset(self, id, cvs_item_ids): + """Return a Changeset with the specified contents. + + This method is only implemented for changesets that can be split. 
+ The type of the new changeset should be the same as that of SELF, + and any other information from SELF should also be copied to the + new changeset.""" + + raise NotImplementedError() + + def __getstate__(self): + return (self.id, self.cvs_item_ids,) + + def __setstate__(self, state): + (self.id, self.cvs_item_ids,) = state + + def __cmp__(self, other): + raise NotImplementedError() + + def __str__(self): + raise NotImplementedError() + + def __repr__(self): + return '%s [%s]' % ( + self, ', '.join(['%x' % id for id in self.cvs_item_ids]),) + + +class RevisionChangeset(Changeset): + """A Changeset consisting of CVSRevisions.""" + + _sort_order = 3 + + def create_graph_node(self, cvs_item_to_changeset_id): + time_range = TimeRange() + pred_ids = set() + succ_ids = set() + + for cvs_item in self.iter_cvs_items(): + time_range.add(cvs_item.timestamp) + + for pred_id in cvs_item.get_pred_ids(): + changeset_id = cvs_item_to_changeset_id.get(pred_id) + if changeset_id is not None: + pred_ids.add(changeset_id) + + for succ_id in cvs_item.get_succ_ids(): + changeset_id = cvs_item_to_changeset_id.get(succ_id) + if changeset_id is not None: + succ_ids.add(changeset_id) + + return ChangesetGraphNode(self, time_range, pred_ids, succ_ids) + + def create_split_changeset(self, id, cvs_item_ids): + return RevisionChangeset(id, cvs_item_ids) + + def __cmp__(self, other): + return cmp(self._sort_order, other._sort_order) \ + or cmp(self.id, other.id) + + def __str__(self): + return 'RevisionChangeset<%x>' % (self.id,) + + +class OrderedChangeset(Changeset): + """A Changeset of CVSRevisions whose preliminary order is known. + + The first changeset ordering involves only RevisionChangesets, and + results in a full ordering of RevisionChangesets (i.e., a linear + chain of dependencies with the order consistent with the + dependencies). 
These OrderedChangesets form the skeleton for the + full topological sort that includes SymbolChangesets as well.""" + + _sort_order = 2 + + def __init__(self, id, cvs_item_ids, ordinal, prev_id, next_id): + Changeset.__init__(self, id, cvs_item_ids) + + # The order of this changeset among all OrderedChangesets: + self.ordinal = ordinal + + # The changeset id of the previous OrderedChangeset, or None if + # this is the first OrderedChangeset: + self.prev_id = prev_id + + # The changeset id of the next OrderedChangeset, or None if this + # is the last OrderedChangeset: + self.next_id = next_id + + def get_projects_opened(self): + retval = set() + for cvs_item in self.iter_cvs_items(): + retval.add(cvs_item.cvs_file.project) + return retval + + def create_graph_node(self, cvs_item_to_changeset_id): + time_range = TimeRange() + + pred_ids = set() + succ_ids = set() + + if self.prev_id is not None: + pred_ids.add(self.prev_id) + + if self.next_id is not None: + succ_ids.add(self.next_id) + + for cvs_item in self.iter_cvs_items(): + time_range.add(cvs_item.timestamp) + + for pred_id in cvs_item.get_symbol_pred_ids(): + changeset_id = cvs_item_to_changeset_id.get(pred_id) + if changeset_id is not None: + pred_ids.add(changeset_id) + + for succ_id in cvs_item.get_symbol_succ_ids(): + changeset_id = cvs_item_to_changeset_id.get(succ_id) + if changeset_id is not None: + succ_ids.add(changeset_id) + + return ChangesetGraphNode(self, time_range, pred_ids, succ_ids) + + def __getstate__(self): + return ( + Changeset.__getstate__(self), + self.ordinal, self.prev_id, self.next_id,) + + def __setstate__(self, state): + (changeset_state, self.ordinal, self.prev_id, self.next_id,) = state + Changeset.__setstate__(self, changeset_state) + + def __cmp__(self, other): + return cmp(self._sort_order, other._sort_order) \ + or cmp(self.id, other.id) + + def __str__(self): + return 'OrderedChangeset<%x(%d)>' % (self.id, self.ordinal,) + + +class SymbolChangeset(Changeset): + """A 
class SymbolChangeset(Changeset):
  """A Changeset consisting of CVSSymbols."""

  def __init__(self, id, symbol, cvs_item_ids):
    Changeset.__init__(self, id, cvs_item_ids)
    # The Symbol (a Branch or Tag) that the member CVSSymbols refer to:
    self.symbol = symbol

  def get_projects_opened(self):
    # A SymbolChangeset can never open a project.
    return set()

  def create_graph_node(self, cvs_item_to_changeset_id):
    """Return a ChangesetGraphNode for this changeset.

    Symbol changesets carry no timestamps, so the node receives an
    empty TimeRange."""

    preds = set()
    succs = set()
    lookup = cvs_item_to_changeset_id.get

    for cvs_item in self.iter_cvs_items():
      for item_id in cvs_item.get_pred_ids():
        mapped = lookup(item_id)
        if mapped is not None:
          preds.add(mapped)
      for item_id in cvs_item.get_succ_ids():
        mapped = lookup(item_id)
        if mapped is not None:
          succs.add(mapped)

    return ChangesetGraphNode(self, TimeRange(), preds, succs)

  def __cmp__(self, other):
    # Order by changeset type, then symbol, then id:
    return cmp(self._sort_order, other._sort_order) \
           or cmp(self.symbol, other.symbol) \
           or cmp(self.id, other.id)

  def __getstate__(self):
    # Pickle only the symbol's id; the Symbol instance is looked up
    # again from the symbol database on unpickling:
    return (Changeset.__getstate__(self), self.symbol.id,)

  def __setstate__(self, state):
    (changeset_state, symbol_id) = state
    Changeset.__setstate__(self, changeset_state)
    self.symbol = Ctx()._symbol_db.get_symbol(symbol_id)


class BranchChangeset(SymbolChangeset):
  """A Changeset consisting of CVSBranches."""

  _sort_order = 1

  def create_split_changeset(self, id, cvs_item_ids):
    """Return a new BranchChangeset with ID, holding CVS_ITEM_IDS."""

    return BranchChangeset(id, self.symbol, cvs_item_ids)

  def __str__(self):
    return 'BranchChangeset<%x>("%s")' % (self.id, self.symbol,)


class TagChangeset(SymbolChangeset):
  """A Changeset consisting of CVSTags."""

  _sort_order = 0

  def create_split_changeset(self, id, cvs_item_ids):
    """Return a new TagChangeset with ID, holding CVS_ITEM_IDS."""

    return TagChangeset(id, self.symbol, cvs_item_ids)

  def __str__(self):
    return 'TagChangeset<%x>("%s")' % (self.id, self.symbol,)
def create_symbol_changeset(id, symbol, cvs_item_ids):
  """Factory function for SymbolChangesets.

  Return a BranchChangeset or TagChangeset, depending on the type of
  SYMBOL.  SYMBOL must be a Branch or Tag; any other type raises
  InternalError."""

  for (symbol_type, changeset_type) in (
      (Branch, BranchChangeset),
      (Tag, TagChangeset),
      ):
    if isinstance(symbol, symbol_type):
      return changeset_type(id, symbol, cvs_item_ids)

  raise InternalError('Unknown symbol type %s' % (symbol,))
# Should the CVSItemToChangesetTable database files be memory mapped?
# This speeds up the conversion but can cause the computer's virtual
# address space to be exhausted.  This option can be changed
# externally, affecting any CVSItemToChangesetTables opened subsequent
# to the change:
use_mmap_for_cvs_item_to_changeset_table = False


def CVSItemToChangesetTable(filename, mode):
  """Open and return the map from CVSItem ids to changeset ids.

  The record table is memory-mapped iff
  use_mmap_for_cvs_item_to_changeset_table is set at call time."""

  if use_mmap_for_cvs_item_to_changeset_table:
    table_type = MmapRecordTable
  else:
    table_type = RecordTable
  return table_type(filename, mode, UnsignedIntegerPacker())


class ChangesetDatabase(IndexedStore):
  """An IndexedStore of Changeset instances, keyed by changeset id."""

  def __init__(self, filename, index_filename, mode):
    # Prime the pickler with all concrete Changeset classes so their
    # definitions are not repeated in every stored record:
    primer = (
        Changeset,
        RevisionChangeset,
        OrderedChangeset,
        SymbolChangeset,
        BranchChangeset,
        TagChangeset,
        )
    IndexedStore.__init__(
        self, filename, index_filename, mode,
        PrimedPickleSerializer(primer))

  def store(self, changeset):
    """Record CHANGESET under its own id."""

    self.add(changeset)

  def keys(self):
    """Return a list of all stored changeset ids."""

    return list(self.iterkeys())

  def close(self):
    IndexedStore.close(self)
class CycleInGraphException(Exception):
  """A dependency cycle was found in the changeset graph."""

  def __init__(self, cycle):
    # Repeat the first changeset at the end so that the rendered path
    # visibly closes the loop: a -> b -> a
    closed_path = cycle + [cycle[0]]
    Exception.__init__(
        self,
        'Cycle found in graph: %s'
        % ' -> '.join(str(changeset) for changeset in closed_path))


class NoPredNodeInGraphException(Exception):
  """A node unexpectedly has no predecessors."""

  def __init__(self, node):
    Exception.__init__(self, 'Node %s has no predecessors' % (node,))
class _NoPredNodes:
  """Manage changesets that are to be processed.

  Output the changesets in order by time and changeset type.

  The implementation of this class is crude: as changesets are added,
  they are appended to a list.  When one is needed, the list is sorted
  in reverse order and then the last changeset in the list is
  returned.  To reduce the number of sorts that are needed, the class
  keeps track of whether the list is currently sorted.

  All this repeated sorting is wasteful and unnecessary.  We should
  instead use a heap to output the changeset order, which would
  require O(lg N) work per add()/get() rather than O(1) and O(N lg N)
  as in the current implementation [1].  But: (1) the lame interface
  of heapq doesn't allow an arbitrary compare function, so we would
  have to store extra information in the array elements; (2) in
  practice, the number of items in the list at any time is only a tiny
  fraction of the total number of changesets; and (3) testing showed
  that the heapq implementation is no faster than this one (perhaps
  because of the increased memory usage).

  [1] According to Objects/listsort.txt in the Python source code, the
  Python list-sorting code is heavily optimized for arrays that have
  runs of already-sorted elements, so the current cost of get() is
  probably closer to O(N) than O(N lg N)."""

  def __init__(self, changeset_db):
    # Used to look up the Changeset for a node's id in add():
    self.changeset_db = changeset_db
    # A list [(node, changeset,)] of nodes with no predecessors:
    self._nodes = []
    # True iff self._nodes is currently sorted (in reverse order);
    # lets get() skip re-sorting when nothing was added in between:
    self._sorted = True

  def __len__(self):
    return len(self._nodes)

  @staticmethod
  def _compare((node_1, changeset_1), (node_2, changeset_2)):
    """Define a (reverse) ordering on self._nodes."""

    # Arguments are swapped deliberately: sorting newest-first leaves
    # the entry to be emitted next at the END of the list, where it
    # can be removed with a cheap pop():
    return cmp(node_2.time_range, node_1.time_range) \
           or cmp(changeset_2, changeset_1)

  def add(self, node):
    # Pair the node with its changeset now so that sorting can compare
    # changesets without any further database lookups:
    self._nodes.append( (node, self.changeset_db[node.id],) )
    self._sorted = False

  def get(self):
    """Return (node, changeset,) of the smallest node.

    'Smallest' is defined by self._compare()."""

    if not self._sorted:
      # Sort lazily, only when an element is actually requested:
      self._nodes.sort(self._compare)
      self._sorted = True
    return self._nodes.pop()
class ChangesetGraph(object):
  """A graph of changesets and their dependencies."""

  def __init__(self, changeset_db, cvs_item_to_changeset_id):
    # Persistent store of Changeset instances, keyed by changeset id:
    self._changeset_db = changeset_db
    # Persistent map from CVSItem id to containing changeset id:
    self._cvs_item_to_changeset_id = cvs_item_to_changeset_id
    # A map { id : ChangesetGraphNode }
    self.nodes = {}

  def close(self):
    """Close the underlying databases and drop the references to them."""

    self._cvs_item_to_changeset_id.close()
    self._cvs_item_to_changeset_id = None
    self._changeset_db.close()
    self._changeset_db = None

  def add_changeset(self, changeset):
    """Add CHANGESET to this graph.

    Determine and record any dependencies to changesets that are
    already in the graph.  This method does not affect the databases."""

    node = changeset.create_graph_node(self._cvs_item_to_changeset_id)

    # Now tie the node into our graph.  If a changeset referenced by
    # node is already in our graph, then add the backwards connection
    # from the other node to the new one.  If not, then delete the
    # changeset from node.

    # (Iterate over copies because the id sets are mutated in place.)
    for pred_id in list(node.pred_ids):
      pred_node = self.nodes.get(pred_id)
      if pred_node is not None:
        pred_node.succ_ids.add(node.id)
      else:
        node.pred_ids.remove(pred_id)

    for succ_id in list(node.succ_ids):
      succ_node = self.nodes.get(succ_id)
      if succ_node is not None:
        succ_node.pred_ids.add(node.id)
      else:
        node.succ_ids.remove(succ_id)

    self.nodes[node.id] = node

  def store_changeset(self, changeset):
    """Record CHANGESET in the databases (the graph is not affected)."""

    for cvs_item_id in changeset.cvs_item_ids:
      self._cvs_item_to_changeset_id[cvs_item_id] = changeset.id
    self._changeset_db.store(changeset)

  def add_new_changeset(self, changeset):
    """Add the new CHANGESET to the graph and also to the databases."""

    if Log().is_on(Log.DEBUG):
      Log().debug('Adding changeset %r' % (changeset,))

    self.add_changeset(changeset)
    self.store_changeset(changeset)

  def delete_changeset(self, changeset):
    """Remove CHANGESET from the graph and also from the databases.

    In fact, we don't remove CHANGESET from
    self._cvs_item_to_changeset_id, because in practice the CVSItems
    in CHANGESET are always added again as part of a new CHANGESET,
    which will cause the old values to be overwritten."""

    if Log().is_on(Log.DEBUG):
      Log().debug('Removing changeset %r' % (changeset,))

    del self[changeset.id]
    del self._changeset_db[changeset.id]

  def __nonzero__(self):
    """Instances are considered True iff they contain any nodes."""

    return bool(self.nodes)

  def __contains__(self, id):
    """Return True if the specified ID is contained in this graph."""

    return id in self.nodes

  def __getitem__(self, id):
    # Raises KeyError if ID is not in the graph:
    return self.nodes[id]

  def get(self, id):
    # Return the node with ID, or None if it is not in the graph:
    return self.nodes.get(id)

  def __delitem__(self, id):
    """Remove the node corresponding to ID.

    Also remove references to it from other nodes.  This method does
    not change pred_ids or succ_ids of the node being deleted, nor
    does it affect the databases."""

    node = self[id]

    for succ_id in node.succ_ids:
      succ = self[succ_id]
      succ.pred_ids.remove(node.id)

    for pred_id in node.pred_ids:
      pred = self[pred_id]
      pred.succ_ids.remove(node.id)

    del self.nodes[node.id]

  def keys(self):
    return self.nodes.keys()

  def __iter__(self):
    # Iterates over the nodes themselves, not their ids:
    return self.nodes.itervalues()

  def _get_path(self, reachable_changesets, starting_node_id, ending_node_id):
    """Return the shortest path from ENDING_NODE_ID to STARTING_NODE_ID.

    Find a path from ENDING_NODE_ID to STARTING_NODE_ID in
    REACHABLE_CHANGESETS, where STARTING_NODE_ID is the id of a
    changeset that depends on the changeset with ENDING_NODE_ID.  (See
    the comment in search_for_path() for a description of the format
    of REACHABLE_CHANGESETS.)

    Return a list of changesets, where the 0th one has ENDING_NODE_ID
    and the last one has STARTING_NODE_ID.  If there is no such path
    described in REACHABLE_CHANGESETS, return None."""

    if ending_node_id not in reachable_changesets:
      return None

    # Walk the predecessor chain that the breadth-first search in
    # search_for_path() recorded:
    path = [self._changeset_db[ending_node_id]]
    id = reachable_changesets[ending_node_id][1]
    while id != starting_node_id:
      path.append(self._changeset_db[id])
      id = reachable_changesets[id][1]
    path.append(self._changeset_db[starting_node_id])
    return path

  def search_for_path(self, starting_node_id, stop_set):
    """Search for paths to prerequisites of STARTING_NODE_ID.

    Try to find the shortest dependency path that causes the changeset
    with STARTING_NODE_ID to depend (directly or indirectly) on one of
    the changesets whose ids are contained in STOP_SET.

    We consider direct and indirect dependencies in the sense that the
    changeset can be reached by following a chain of predecessor nodes.

    When one of the changeset_ids in STOP_SET is found, terminate the
    search and return the path from that changeset_id to
    STARTING_NODE_ID.  If no path is found to a node in STOP_SET,
    return None."""

    # A map {node_id : (steps, next_node_id)} where NODE_ID can be
    # reached from STARTING_NODE_ID in STEPS steps, and NEXT_NODE_ID
    # is the id of the previous node in the path.  STARTING_NODE_ID is
    # only included as a key if there is a loop leading back to it.
    reachable_changesets = {}

    # A list of (node_id, steps) that still have to be investigated,
    # and STEPS is the number of steps to get to NODE_ID.
    open_nodes = [(starting_node_id, 0)]
    # A breadth-first search:
    while open_nodes:
      (id, steps) = open_nodes.pop(0)
      steps += 1
      node = self[id]
      for pred_id in node.pred_ids:
        # Since the search is breadth-first, we only have to set steps
        # that don't already exist.
        if pred_id not in reachable_changesets:
          reachable_changesets[pred_id] = (steps, id)
          open_nodes.append((pred_id, steps))

          # See if we can stop now:
          if pred_id in stop_set:
            return self._get_path(
                reachable_changesets, starting_node_id, pred_id
                )

    return None

  def consume_nopred_nodes(self):
    """Remove and yield changesets in dependency order.

    Each iteration, this generator yields a (changeset, time_range)
    tuple for the oldest changeset in the graph that doesn't have any
    predecessor nodes (i.e., it is ready to be committed).  This is
    continued until there are no more nodes without predecessors
    (either because the graph has been emptied, or because of cycles
    in the graph).

    Among the changesets that are ready to be processed, the earliest
    one (according to the sorting of the TimeRange class) is yielded
    each time.  (This is the order in which the changesets should be
    committed.)

    The graph should not be otherwise altered while this generator is
    running."""

    # Find a list of (node,changeset,) where the node has no
    # predecessors:
    nopred_nodes = _NoPredNodes(self._changeset_db)
    for node in self.nodes.itervalues():
      if not node.pred_ids:
        nopred_nodes.add(node)

    while nopred_nodes:
      (node, changeset,) = nopred_nodes.get()
      del self[node.id]
      # See if any successors are now ready for extraction:
      for succ_id in node.succ_ids:
        succ = self[succ_id]
        if not succ.pred_ids:
          nopred_nodes.add(succ)
      yield (changeset, node.time_range)

  def find_cycle(self, starting_node_id):
    """Find a cycle in the dependency graph and return it.

    Use STARTING_NODE_ID as the place to start looking.  This routine
    must only be called after all nopred_nodes have been removed.
    Return the list of changesets that are involved in the cycle
    (ordered such that cycle[n-1] is a predecessor of cycle[n] and
    cycle[-1] is a predecessor of cycle[0])."""

    # Since there are no nopred nodes in the graph, all nodes in the
    # graph must either be involved in a cycle or depend (directly or
    # indirectly) on nodes that are in a cycle.

    # Pick an arbitrary node:
    node = self[starting_node_id]

    seen_nodes = [node]

    # Follow it backwards until a node is seen a second time; then we
    # have our cycle.
    while True:
      # Pick an arbitrary predecessor of node.  It must exist, because
      # there are no nopred nodes:
      try:
        node_id = node.pred_ids.__iter__().next()
      except StopIteration:
        raise NoPredNodeInGraphException(node)
      node = self[node_id]
      try:
        i = seen_nodes.index(node)
      except ValueError:
        seen_nodes.append(node)
      else:
        # NODE was already visited; the tail of seen_nodes (from its
        # first occurrence) is the cycle, recorded backwards:
        seen_nodes = seen_nodes[i:]
        seen_nodes.reverse()
        return [self._changeset_db[node.id] for node in seen_nodes]

  def consume_graph(self, cycle_breaker=None):
    """Remove and yield changesets from this graph in dependency order.

    Each iteration, this generator yields a (changeset, time_range)
    tuple for the oldest changeset in the graph that doesn't have any
    predecessor nodes.  If CYCLE_BREAKER is specified, then call
    CYCLE_BREAKER(cycle) whenever a cycle is encountered, where cycle
    is the list of changesets that are involved in the cycle (ordered
    such that cycle[n-1] is a predecessor of cycle[n] and cycle[-1] is
    a predecessor of cycle[0]).  CYCLE_BREAKER should break the cycle
    in place then return.

    If a cycle is found and CYCLE_BREAKER was not specified, raise
    CycleInGraphException."""

    while True:
      for (changeset, time_range) in self.consume_nopred_nodes():
        yield (changeset, time_range)

      # If there are any nodes left in the graph, then there must be
      # at least one cycle.  Find a cycle and process it.

      # This might raise StopIteration, but that indicates that the
      # graph has been fully consumed, so we just let the exception
      # escape.
      start_node_id = self.nodes.iterkeys().next()

      cycle = self.find_cycle(start_node_id)

      if cycle_breaker is not None:
        cycle_breaker(cycle)
      else:
        raise CycleInGraphException(cycle)

  def __repr__(self):
    """For convenience only.  The format is subject to change at any time."""

    if self.nodes:
      return 'ChangesetGraph:\n%s' \
             % ''.join(['  %r\n' % node for node in self])
    else:
      return 'ChangesetGraph:\n  EMPTY\n'

  # Fill colors used to render each changeset type in DOT output:
  node_colors = {
    RevisionChangeset : 'lightgreen',
    OrderedChangeset : 'cyan',
    BranchChangeset : 'orange',
    TagChangeset : 'yellow',
    }

  def output_coarse_dot(self, f):
    """Output the graph in DOT format to file-like object f.

    Such a file can be rendered into a visual representation of the
    graph using tools like graphviz.  Include only changesets in the
    graph, and the dependencies between changesets."""

    f.write('digraph G {\n')
    for node in self:
      f.write(
          '  C%x [style=filled, fillcolor=%s];\n' % (
              node.id,
              self.node_colors[self._changeset_db[node.id].__class__],
              )
          )
    f.write('\n')

    for node in self:
      for succ_id in node.succ_ids:
        f.write('  C%x -> C%x\n' % (node.id, succ_id,))
    f.write('\n')

    f.write('}\n')

  def output_fine_dot(self, f):
    """Output the graph in DOT format to file-like object f.

    Such a file can be rendered into a visual representation of the
    graph using tools like graphviz.  Include all CVSItems and the
    CVSItem-CVSItem dependencies in the graph.  Group the CVSItems
    into clusters by changeset."""

    f.write('digraph G {\n')
    for node in self:
      f.write('  subgraph cluster_%x {\n' % (node.id,))
      f.write('    label = "C%x";\n' % (node.id,))
      changeset = self._changeset_db[node.id]
      for item_id in changeset.cvs_item_ids:
        f.write('    I%x;\n' % (item_id,))
      f.write('    style=filled;\n')
      f.write(
          '    fillcolor=%s;\n'
          % (self.node_colors[self._changeset_db[node.id].__class__],))
      f.write('  }\n\n')

    for node in self:
      changeset = self._changeset_db[node.id]
      for cvs_item in changeset.iter_cvs_items():
        for succ_id in cvs_item.get_succ_ids():
          f.write('  I%x -> I%x;\n' % (cvs_item.id, succ_id,))

    f.write('\n')

    f.write('}\n')
# A cvs_item doesn't depend on any cvs_items in either pred or succ:
LINK_NONE = 0

# A cvs_item depends on one or more cvs_items in pred but none in succ:
LINK_PRED = 1

# A cvs_item depends on one or more cvs_items in succ but none in pred:
LINK_SUCC = 2

# A cvs_item depends on one or more cvs_items in both pred and succ:
LINK_PASSTHRU = LINK_PRED | LINK_SUCC


class ChangesetGraphLink(object):
  """A PRED -> CHANGESET -> SUCC link in a changeset graph loop."""

  def __init__(self, pred, changeset, succ):
    """Represent a link in a loop in a changeset graph.

    This is the link that goes from PRED -> CHANGESET -> SUCC.

    We are mainly concerned with how many CVSItems have LINK_PRED,
    LINK_SUCC, and LINK_PASSTHRU type links to the neighboring
    commitsets.  If necessary, this class can also break up CHANGESET
    into multiple changesets."""

    self.pred = pred
    self.pred_ids = set(pred.cvs_item_ids)

    self.changeset = changeset

    self.succ_ids = set(succ.cvs_item_ids)
    self.succ = succ

    # A count of each type of link for cvs_items in changeset
    # (indexed by LINK_* constants):
    link_counts = [0] * 4

    # Iterate directly; there is no need to materialize the items into
    # a list for a single traversal:
    for cvs_item in changeset.iter_cvs_items():
      link_counts[self.get_link_type(cvs_item)] += 1

    [self.pred_links, self.succ_links, self.passthru_links] = link_counts[1:]

  def get_link_type(self, cvs_item):
    """Return the type of links from CVS_ITEM to self.PRED and self.SUCC.

    The return value is one of LINK_NONE, LINK_PRED, LINK_SUCC, or
    LINK_PASSTHRU."""

    retval = LINK_NONE

    if cvs_item.get_pred_ids() & self.pred_ids:
      retval |= LINK_PRED
    if cvs_item.get_succ_ids() & self.succ_ids:
      retval |= LINK_SUCC

    return retval

  def get_links_to_move(self):
    """Return the number of items that would be moved to split changeset."""

    # If items are linked on both sides, the smaller side moves; if
    # only one side has links, that side moves:
    return min(self.pred_links, self.succ_links) \
           or max(self.pred_links, self.succ_links)

  def is_breakable(self):
    """Return True iff breaking the changeset will do any good."""

    return self.pred_links != 0 or self.succ_links != 0

  def __cmp__(self, other):
    """Compare SELF with OTHER in terms of which would be better to break.

    The one that is better to break is considered the lesser."""

    return (
        - cmp(int(self.is_breakable()), int(other.is_breakable()))
        or cmp(self.passthru_links, other.passthru_links)
        or cmp(self.get_links_to_move(), other.get_links_to_move())
        )

  def break_changeset(self, changeset_key_generator):
    """Break up self.changeset and return the fragments.

    Break it up in such a way that the link is weakened as efficiently
    as possible.  Raise ValueError if the changeset is not breakable."""

    if not self.is_breakable():
      raise ValueError('Changeset is not breakable: %r' % self.changeset)

    pred_items = []
    succ_items = []

    # For each link type, should such CVSItems be moved to the
    # changeset containing the predecessor items or the one containing
    # the successor items?
    destination = {
      LINK_PRED : pred_items,
      LINK_SUCC : succ_items,
      }

    # Decide where the LINK_NONE and LINK_PASSTHRU items should go:
    if self.pred_links == 0:
      destination[LINK_NONE] = pred_items
      destination[LINK_PASSTHRU] = pred_items
    elif self.succ_links == 0:
      destination[LINK_NONE] = succ_items
      destination[LINK_PASSTHRU] = succ_items
    elif self.pred_links < self.succ_links:
      destination[LINK_NONE] = succ_items
      destination[LINK_PASSTHRU] = succ_items
    else:
      destination[LINK_NONE] = pred_items
      destination[LINK_PASSTHRU] = pred_items

    for cvs_item in self.changeset.iter_cvs_items():
      link_type = self.get_link_type(cvs_item)
      destination[link_type].append(cvs_item.id)

    # Create new changesets of the same type as the old one:
    return [
      self.changeset.create_split_changeset(
          changeset_key_generator.gen_id(), pred_items),
      self.changeset.create_split_changeset(
          changeset_key_generator.gen_id(), succ_items),
      ]

  def __str__(self):
    return 'Link<%x>(%d, %d, %d)' % (
        self.changeset.id,
        self.pred_links, self.succ_links, self.passthru_links)
class ChangesetGraphNode(object):
  """A node in the changeset dependency graph."""

  # One node exists per changeset, so keep instances lean:
  __slots__ = ['id', 'time_range', 'pred_ids', 'succ_ids']

  def __init__(self, changeset, time_range, pred_ids, succ_ids):
    """Create a node for CHANGESET with the given edges.

    TIME_RANGE is the range of times of CVSItems within the changeset;
    PRED_IDS/SUCC_IDS are sets of changeset ids of direct
    predecessors/successors."""

    # The node shares its id with the changeset it represents:
    self.id = changeset.id

    # The range of times of CVSItems within this Changeset:
    self.time_range = time_range

    # The set of changeset ids of changesets that are direct
    # predecessors of this one:
    self.pred_ids = pred_ids

    # The set of changeset ids of changesets that are direct
    # successors of this one:
    self.succ_ids = succ_ids

  def __repr__(self):
    """For convenience only.  The format is subject to change at any time."""

    pred_str = ','.join(['%x' % pred_id for pred_id in self.pred_ids])
    succ_str = ','.join(['%x' % succ_id for succ_id in self.succ_ids])
    return '%x; pred=[%s]; succ=[%s]' % (self.id, pred_str, succ_str)
class CheckDependenciesPass(Pass):
  """Check that the dependencies are self-consistent."""

  def __init__(self):
    Pass.__init__(self)

  def register_artifacts(self):
    # These artifacts are needed to resolve files and symbols while
    # checking item dependencies:
    self._register_temp_file_needed(config.PROJECTS)
    self._register_temp_file_needed(config.SYMBOL_DB)
    self._register_temp_file_needed(config.CVS_FILES_DB)

  def iter_cvs_items(self):
    """Iterate over every CVSItem to be checked.

    Abstract; concrete subclasses supply the item source."""

    raise NotImplementedError()

  def get_cvs_item(self, item_id):
    """Return the CVSItem with id ITEM_ID.

    Abstract; concrete subclasses supply the item source."""

    raise NotImplementedError()

  def run(self, run_options, stats_keeper):
    # Install the global context that CVSItem methods rely on:
    Ctx()._projects = read_projects(
        artifact_manager.get_temp_file(config.PROJECTS)
        )
    Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ)
    self.symbol_db = SymbolDatabase()
    Ctx()._symbol_db = self.symbol_db

    Log().quiet("Checking dependency consistency...")

    fatal_errors = []
    for cvs_item in self.iter_cvs_items():
      # Check that the pred_ids and succ_ids are mutually consistent:
      for pred_id in cvs_item.get_pred_ids():
        pred = self.get_cvs_item(pred_id)
        if not cvs_item.id in pred.get_succ_ids():
          fatal_errors.append(
              '%s lists pred=%s, but not vice versa.' % (cvs_item, pred,))

      for succ_id in cvs_item.get_succ_ids():
        succ = self.get_cvs_item(succ_id)
        if not cvs_item.id in succ.get_pred_ids():
          fatal_errors.append(
              '%s lists succ=%s, but not vice versa.' % (cvs_item, succ,))

    if fatal_errors:
      raise FatalException(
          'Dependencies inconsistent:\n'
          '%s\n'
          'Exited due to fatal error(s).'
          % ('\n'.join(fatal_errors),)
          )

    self.symbol_db.close()
    self.symbol_db = None
    Ctx()._cvs_file_db.close()
    Log().quiet("Done")


class CheckItemStoreDependenciesPass(CheckDependenciesPass):
  """Check dependencies of CVSItems read from an OldCVSItemStore."""

  def __init__(self, cvs_items_store_file):
    CheckDependenciesPass.__init__(self)
    self.cvs_items_store_file = cvs_items_store_file

  def register_artifacts(self):
    CheckDependenciesPass.register_artifacts(self)
    self._register_temp_file_needed(self.cvs_items_store_file)

  def iter_cvs_items(self):
    cvs_item_store = OldCVSItemStore(
        artifact_manager.get_temp_file(self.cvs_items_store_file))

    for cvs_file_items in cvs_item_store.iter_cvs_file_items():
      # Remember the current file's items so that get_cvs_item() can
      # resolve ids within the same file while we yield its items:
      self.current_cvs_file_items = cvs_file_items
      for cvs_item in cvs_file_items.values():
        yield cvs_item

    del self.current_cvs_file_items

    cvs_item_store.close()

  def get_cvs_item(self, item_id):
    return self.current_cvs_file_items[item_id]


class CheckIndexedItemStoreDependenciesPass(CheckDependenciesPass):
  """Check dependencies of CVSItems read from an IndexedCVSItemStore."""

  def __init__(self, cvs_items_store_file, cvs_items_store_index_file):
    CheckDependenciesPass.__init__(self)
    self.cvs_items_store_file = cvs_items_store_file
    self.cvs_items_store_index_file = cvs_items_store_index_file

  def register_artifacts(self):
    CheckDependenciesPass.register_artifacts(self)
    self._register_temp_file_needed(self.cvs_items_store_file)
    self._register_temp_file_needed(self.cvs_items_store_index_file)

  def iter_cvs_items(self):
    return self.cvs_item_store.itervalues()

  def get_cvs_item(self, item_id):
    # The indexed store supports random access by item id:
    return self.cvs_item_store[item_id]

  def run(self, run_options, stats_keeper):
    # Open the indexed store before the base-class run() iterates it:
    self.cvs_item_store = IndexedCVSItemStore(
        artifact_manager.get_temp_file(self.cvs_items_store_file),
        artifact_manager.get_temp_file(self.cvs_items_store_index_file),
        DB_OPEN_READ)

    CheckDependenciesPass.run(self, run_options, stats_keeper)

    self.cvs_item_store.close()
    self.cvs_item_store = None
To +generate the text for a typical revision, we need the revision's delta +text plus the fulltext of the previous revision. Therefore, we +maintain a checkout database containing a copy of the fulltext of any +revision for which subsequent revisions still need to be retrieved. +It is crucial to remove text from this database as soon as it is no +longer needed, to prevent it from growing enormous. + +There are two reasons that the text from a revision can be needed: (1) +because the revision itself still needs to be output to a dumpfile; +(2) because another revision needs it as the base of its delta. We +maintain a reference count for each revision, which includes *both* +possibilities. The first time a revision's text is needed, it is +generated by applying the revision's deltatext to the previous +revision's fulltext, and the resulting fulltext is stored in the +checkout database. Each time a revision's fulltext is retrieved, its +reference count is decremented. When the reference count goes to +zero, then the fulltext is deleted from the checkout database. + +The administrative data for managing this consists of one TextRecord +entry for each revision. Each TextRecord has an id, which is the same +id as used for the corresponding CVSRevision instance. It also +maintains a count of the times it is expected to be retrieved. +TextRecords come in several varieties: + +FullTextRecord -- Used for revisions whose fulltext is contained + directly in the RCS file, and therefore available during + CollectRevsPass (i.e., typically revision 1.1 of each file). + +DeltaTextRecord -- Used for revisions that are defined via a delta + relative to some other TextRecord. These records record the id of + the TextRecord that holds the base text against which the delta is + defined. When the text for a DeltaTextRecord is retrieved, the + DeltaTextRecord instance is deleted and a CheckedOutTextRecord + instance is created to take its place. 
+ +CheckedOutTextRecord -- Used during OutputPass for a revision that + started out as a DeltaTextRecord, but has already been retrieved + (and therefore its fulltext is stored in the checkout database). + +While a file is being processed during CollectRevsPass, the fulltext +and deltas are stored to the delta database, and TextRecord instances +are created to keep track of things. The reference counts are all +initialized to zero. + +After CollectRevsPass has done any preliminary tree mangling, its +_FileDataCollector.parse_completed(), method calls +RevisionRecorder.finish_file(), passing it the CVSFileItems instance +that describes the revisions in the file. At this point the reference +counts for the file's TextRecords are updated: each record referred to +by a delta has its refcount incremented, and each record that +corresponds to a non-delete CVSRevision is incremented. After that, +any records with refcount==0 are removed. When one record is removed, +that can cause another record's reference count to go to zero and be +removed too, recursively. When a TextRecord is deleted at this stage, +its deltatext is also deleted from the delta database. 
class TextRecord(object):
  """Bookkeeping data for the text of a single CVSRevision."""

  __slots__ = ['id', 'refcount']

  def __init__(self, id):
    # The cvs_rev_id of the revision whose text this is.
    self.id = id

    # How many times this revision's text still has to be retrieved.
    self.refcount = 0

  def __getstate__(self):
    return (self.id, self.refcount,)

  def __setstate__(self, state):
    (self.id, self.refcount,) = state

  def increment_dependency_refcounts(self, db):
    """Increment the refcounts of any records that this one depends on."""

    pass

  def decrement_refcount(self, db):
    """Note one retrieval of our text.

    When the reference count reaches zero, ask DB to discard this
    record."""

    self.refcount -= 1
    if self.refcount == 0:
      db.discard(self.id)

  def checkout(self, db):
    """Workhorse of the checkout process.

    Return the text for this revision, decrement our reference count,
    and update the databases depending on whether there will be future
    checkouts."""

    raise NotImplementedError()

  def free(self, db):
    """This instance will never again be checked out; free it.

    Also free any associated resources and decrement the refcounts of
    any other TextRecords that this one depends on."""

    raise NotImplementedError()


class FullTextRecord(TextRecord):
  """A TextRecord whose fulltext is stored directly in the delta database.

  Typically used for revision 1.1 of each file, whose fulltext is
  available during CollectRevsPass."""

  __slots__ = []

  def __getstate__(self):
    return (self.id, self.refcount,)

  def __setstate__(self, state):
    (self.id, self.refcount,) = state

  def checkout(self, db):
    retval = db.delta_db[self.id]
    self.decrement_refcount(db)
    return retval

  def free(self, db):
    del db.delta_db[self.id]

  def __str__(self):
    return 'FullTextRecord(%x, %d)' % (self.id, self.refcount,)


class DeltaTextRecord(TextRecord):
  """A TextRecord defined as a delta relative to another TextRecord."""

  __slots__ = ['pred_id']

  def __init__(self, id, pred_id):
    TextRecord.__init__(self, id)

    # The cvs_rev_id of the revision relative to which this delta is
    # defined.
    self.pred_id = pred_id

  def __getstate__(self):
    return (self.id, self.refcount, self.pred_id,)

  def __setstate__(self, state):
    (self.id, self.refcount, self.pred_id,) = state

  def increment_dependency_refcounts(self, db):
    # We hold a reference to the record our delta is based on:
    db[self.pred_id].refcount += 1

  def checkout(self, db):
    stream = RCSStream(db[self.pred_id].checkout(db))
    stream.apply_diff(db.delta_db[self.id])
    text = stream.get_text()
    del stream
    self.refcount -= 1
    if self.refcount == 0:
      # This text will never be needed again; just delete ourselves
      # without ever having stored the fulltext to the checkout
      # database:
      del db[self.id]
    else:
      # Cache the fulltext and swap in a CheckedOutTextRecord in place
      # of ourselves:
      db.checkout_db['%x' % self.id] = text
      replacement = CheckedOutTextRecord(self.id)
      replacement.refcount = self.refcount
      db.replace(replacement)
    return text

  def free(self, db):
    del db.delta_db[self.id]
    db[self.pred_id].decrement_refcount(db)

  def __str__(self):
    return 'DeltaTextRecord(%x -> %x, %d)' \
           % (self.pred_id, self.id, self.refcount,)


class CheckedOutTextRecord(TextRecord):
  """A TextRecord whose fulltext is already cached in the checkout db.

  Created during OutputPass for a revision that started out as a
  DeltaTextRecord but has already been retrieved once."""

  __slots__ = []

  def __getstate__(self):
    return (self.id, self.refcount,)

  def __setstate__(self, state):
    (self.id, self.refcount,) = state

  def checkout(self, db):
    retval = db.checkout_db['%x' % self.id]
    self.decrement_refcount(db)
    return retval

  def free(self, db):
    del db.checkout_db['%x' % self.id]

  def __str__(self):
    return 'CheckedOutTextRecord(%x, %d)' % (self.id, self.refcount,)
class NullDatabase(object):
  """A do-nothing database that can be used with TextRecordDatabase.

  Use this when you don't actually want to allow anything to be
  deleted."""

  def __delitem__(self, id):
    pass


class TextRecordDatabase:
  """Hold the TextRecord instances that are currently live.

  During CollectRevsPass and FilterSymbolsPass, files are processed
  one by one and a new TextRecordDatabase instance is used for each
  file.  During OutputPass, a single TextRecordDatabase instance is
  used for the duration of OutputPass; individual records are added
  and removed when they are active."""

  def __init__(self, delta_db, checkout_db):
    # A map { cvs_rev_id -> TextRecord }.
    self.text_records = {}

    # A database-like object using cvs_rev_ids as keys and containing
    # fulltext/deltatext strings as values.  Its __getitem__() method
    # is used to retrieve deltas when they are needed, and its
    # __delitem__() method is used to delete deltas when they can be
    # freed.  The modifiability of the delta database varies from pass
    # to pass, so the object stored here varies as well:
    #
    # CollectRevsPass: a fully-functional IndexedDatabase, so deltas
    #     that will not be needed can be deleted.
    #
    # FilterSymbolsPass: a NullDatabase.  The delta database cannot be
    #     modified during this pass, and deltas are never retrieved,
    #     so a dummy object suffices.
    #
    # OutputPass: an IndexedDatabase whose __delitem__() has been
    #     disabled, because deltas must be retrieved but the delta
    #     database must not be modified.
    self.delta_db = delta_db

    # A database-like object using cvs_rev_ids as keys and containing
    # fulltext strings as values.  Only set during OutputPass.
    self.checkout_db = checkout_db

    # Either None, or a list of ids of text_records awaiting deletion.
    # When it is a list, discard() only queues ids on it instead of
    # deleting them immediately; when it is None, discard() processes
    # deletions itself.  See discard() for why.
    self.deferred_deletes = None

  def __getstate__(self):
    return (list(self.text_records.values()),)

  def __setstate__(self, state):
    (text_records,) = state
    self.text_records = {}
    for text_record in text_records:
      self.add(text_record)
    # Unpickled instances get inert databases:
    self.delta_db = NullDatabase()
    self.checkout_db = NullDatabase()
    self.deferred_deletes = None

  def add(self, text_record):
    """Add TEXT_RECORD to our database.

    There must not already be a record with the same id."""

    assert text_record.id not in self.text_records

    self.text_records[text_record.id] = text_record

  def __getitem__(self, id):
    return self.text_records[id]

  def __delitem__(self, id):
    """Free the record with the specified ID."""

    del self.text_records[id]

  def replace(self, text_record):
    """Store TEXT_RECORD in place of the existing record with the same id.

    Do not do anything with the old record."""

    assert text_record.id in self.text_records
    self.text_records[text_record.id] = text_record

  def discard(self, *ids):
    """The text records with IDS are no longer needed; discard them.

    This involves calling their free() methods and also removing them
    from SELF.

    If SELF.deferred_deletes is not None, a discard is already being
    processed higher up the call stack, so the ids are only queued on
    that list.  This iterative scheme prevents a stack overflow from
    the avalanche of deletes that can result from discarding a long
    chain of revisions."""

    if self.deferred_deletes is not None:
      # An outer-level discard is in progress; let it do the work.
      self.deferred_deletes.extend(ids)
      return

    # This is the outer-level call; drain the queue iteratively.
    self.deferred_deletes = list(ids)
    while self.deferred_deletes:
      id = self.deferred_deletes.pop()
      text_record = self[id]
      if text_record.refcount != 0:
        raise InternalError(
            'TextRecordDatabase.discard(%s) called with refcount = %d'
            % (text_record, text_record.refcount,)
            )
      # This call might cause other text_record ids to be added to
      # self.deferred_deletes:
      text_record.free(self)
      del self[id]
    self.deferred_deletes = None

  def itervalues(self):
    return iter(self.text_records.values())

  def recompute_refcounts(self, cvs_file_items):
    """Recompute the refcounts of the contained TextRecords.

    Use CVS_FILE_ITEMS to determine which records will be needed by
    cvs2svn."""

    # Reset every count, then re-derive the two kinds of references:
    for text_record in self.itervalues():
      text_record.refcount = 0

    # (1) Records needed as the base of another record's delta:
    for text_record in self.itervalues():
      text_record.increment_dependency_refcounts(self.text_records)

    # (2) Records whose text cvs2svn will actually check out:
    for lod_items in cvs_file_items.iter_lods():
      for cvs_rev in lod_items.cvs_revisions:
        if isinstance(cvs_rev, CVSRevisionModification):
          self[cvs_rev.id].refcount += 1

  def free_unused(self):
    """Free any TextRecords whose reference counts are zero.

    The deletion of some of these text records might cause others to
    become unused, in which case they are deleted automatically.  The
    initially-unused records are not referred to by any others, so
    none of them can be deleted before we get to them; but it *is*
    crucial to snapshot the whole unused list before starting."""

    doomed = [
        text_record.id
        for text_record in self.itervalues()
        if text_record.refcount == 0
        ]

    self.discard(*doomed)

  def log_leftovers(self):
    """If any TextRecords still exist, log them."""

    if self.text_records:
      Log().warn(
          "%s: internal problem: leftover revisions in the checkout cache:"
          % warning_prefix)
      for text_record in self.itervalues():
        Log().warn('  %s' % (text_record,))

  def __repr__(self):
    """Debugging output of the current contents of the TextRecordDatabase."""

    lines = ['TextRecordDatabase:']
    lines.extend('  %s' % (text_record,) for text_record in self.itervalues())
    return '\n'.join(lines)
+ def record_text(self, cvs_rev, log, text): + if isinstance(cvs_rev.lod, Trunk): + # On trunk, revisions are encountered in reverse order (1. + # ... 1.1) and deltas are inverted. The first text that we see + # is the fulltext for the HEAD revision. After that, the text + # corresponding to revision 1.N is the delta (1. -> + # 1.)). We have to invert the deltas here so that we can + # read the revisions out in dependency order; that is, for + # revision 1.1 we want the fulltext, and for revision 1. we + # want the delta (1. -> 1.). This means that we can't + # compute the delta for a revision until we see its logical + # parent. When we finally see revision 1.1 (which is recognized + # because it doesn't have a parent), we can record the diff (1.1 + # -> 1.2) for revision 1.2, and also the fulltext for 1.1. + + if cvs_rev.next_id is None: + # This is HEAD, as fulltext. Initialize the RCSStream so + # that we can compute deltas backwards in time. + self._stream = RCSStream(text) + else: + # Any other trunk revision is a backward delta. Apply the + # delta to the RCSStream to mutate it to the contents of this + # revision, and also to get the reverse delta, which we store + # as the forward delta of our child revision. + try: + text = self._stream.invert_diff(text) + except MalformedDeltaException, (msg): + Log().error('Malformed RCS delta in %s, revision %s: %s' + % (cvs_rev.cvs_file.get_filename(), cvs_rev.rev, + msg)) + raise RuntimeError + text_record = DeltaTextRecord(cvs_rev.next_id, cvs_rev.id) + self._writeout(text_record, text) + + if cvs_rev.prev_id is None: + # This is revision 1.1. Write its fulltext: + text_record = FullTextRecord(cvs_rev.id) + self._writeout(text_record, self._stream.get_text()) + + # There will be no more trunk revisions delivered, so free the + # RCSStream. + del self._stream + + else: + # On branches, revisions are encountered in logical order + # (.1 ... .) and the text corresponding to + # revision . is the forward delta (. 
-> + # .). That's what we need, so just store it. + + # FIXME: It would be nice to avoid writing out branch deltas + # when --trunk-only. (They will be deleted when finish_file() + # is called, but if the delta db is in an IndexedDatabase the + # deletions won't actually recover any disk space.) + text_record = DeltaTextRecord(cvs_rev.id, cvs_rev.prev_id) + self._writeout(text_record, text) + + return None + + def _writeout(self, text_record, text): + self.text_record_db.add(text_record) + self._rcs_deltas[text_record.id] = text + + def finish_file(self, cvs_file_items): + """Finish processing of the current file. + + Compute the initial text record refcounts, discard any records + that are unneeded, and store the text records for the file to the + _rcs_trees database.""" + + # Delete our copy of the preliminary CVSFileItems: + del self._cvs_file_items + + self.text_record_db.recompute_refcounts(cvs_file_items) + self.text_record_db.free_unused() + self._rcs_trees[cvs_file_items.cvs_file.id] = self.text_record_db + del self.text_record_db + + def finish(self): + self._rcs_deltas.close() + self._rcs_trees.close() + + +class InternalRevisionExcluder(RevisionExcluder): + """The RevisionExcluder used by InternalRevisionReader.""" + + def register_artifacts(self, which_pass): + artifact_manager.register_temp_file_needed( + config.RCS_TREES_STORE, which_pass + ) + artifact_manager.register_temp_file_needed( + config.RCS_TREES_INDEX_TABLE, which_pass + ) + artifact_manager.register_temp_file( + config.RCS_TREES_FILTERED_STORE, which_pass + ) + artifact_manager.register_temp_file( + config.RCS_TREES_FILTERED_INDEX_TABLE, which_pass + ) + + def start(self): + self._tree_db = IndexedDatabase( + artifact_manager.get_temp_file(config.RCS_TREES_STORE), + artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE), + DB_OPEN_READ) + primer = (FullTextRecord, DeltaTextRecord) + self._new_tree_db = IndexedDatabase( + 
artifact_manager.get_temp_file(config.RCS_TREES_FILTERED_STORE), + artifact_manager.get_temp_file(config.RCS_TREES_FILTERED_INDEX_TABLE), + DB_OPEN_NEW, PrimedPickleSerializer(primer)) + + def process_file(self, cvs_file_items): + text_record_db = self._tree_db[cvs_file_items.cvs_file.id] + text_record_db.recompute_refcounts(cvs_file_items) + text_record_db.free_unused() + self._new_tree_db[cvs_file_items.cvs_file.id] = text_record_db + + def finish(self): + self._tree_db.close() + self._new_tree_db.close() + + +class _KeywordExpander: + """A class whose instances provide substitutions for CVS keywords. + + This class is used via its __call__() method, which should be called + with a match object representing a match for a CVS keyword string. + The method returns the replacement for the matched text. + + The __call__() method works by calling the method with the same name + as that of the CVS keyword (converted to lower case). + + Instances of this class can be passed as the REPL argument to + re.sub().""" + + date_fmt_old = "%Y/%m/%d %H:%M:%S" # CVS 1.11, rcs + date_fmt_new = "%Y-%m-%d %H:%M:%S" # CVS 1.12 + + date_fmt = date_fmt_new + + @classmethod + def use_old_date_format(klass): + """Class method to ensure exact compatibility with CVS 1.11 + output. 
Use this if you want to verify your conversion and you're + using CVS 1.11.""" + klass.date_fmt = klass.date_fmt_old + + def __init__(self, cvs_rev): + self.cvs_rev = cvs_rev + + def __call__(self, match): + return '$%s: %s $' % \ + (match.group(1), getattr(self, match.group(1).lower())(),) + + def author(self): + return Ctx()._metadata_db[self.cvs_rev.metadata_id].original_author + + def date(self): + return time.strftime(self.date_fmt, + time.gmtime(self.cvs_rev.timestamp)) + + def header(self): + return '%s %s %s %s Exp' % \ + (self.source(), self.cvs_rev.rev, self.date(), self.author()) + + def id(self): + return '%s %s %s %s Exp' % \ + (self.rcsfile(), self.cvs_rev.rev, self.date(), self.author()) + + def locker(self): + # Handle kvl like kv, as a converted repo is supposed to have no + # locks. + return '' + + def log(self): + # Would need some special handling. + return 'not supported by cvs2svn' + + def name(self): + # Cannot work, as just creating a new symbol does not check out + # the revision again. + return 'not supported by cvs2svn' + + def rcsfile(self): + return self.cvs_rev.cvs_file.basename + ",v" + + def revision(self): + return self.cvs_rev.rev + + def source(self): + project = self.cvs_rev.cvs_file.project + return project.cvs_repository_root + '/' + project.cvs_module + \ + self.cvs_rev.cvs_file.cvs_path + ",v" + + def state(self): + # We check out only live revisions. 
class InternalRevisionReader(RevisionReader):
  """A RevisionReader that reads the contents from an own delta store."""

  # Pattern fragments matching the CVS keywords we know how to expand:
  _kws = 'Author|Date|Header|Id|Locker|Log|Name|RCSfile|Revision|Source|State'
  # Matches only already-expanded keywords (with a ':value' part):
  _kw_re = re.compile(r'\$(' + _kws + r'):[^$\n]*\$')
  # Matches keywords whether expanded or not:
  _kwo_re = re.compile(r'\$(' + _kws + r')(:[^$\n]*)?\$')

  def __init__(self, compress):
    self._compress = compress

  def register_artifacts(self, which_pass):
    artifact_manager.register_temp_file(config.CVS_CHECKOUT_DB, which_pass)
    for artifact in [
        config.RCS_DELTAS_STORE,
        config.RCS_DELTAS_INDEX_TABLE,
        config.RCS_TREES_FILTERED_STORE,
        config.RCS_TREES_FILTERED_INDEX_TABLE,
        ]:
      artifact_manager.register_temp_file_needed(artifact, which_pass)

  def start(self):
    """Open the delta, tree, and checkout databases."""

    self._delta_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
        artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
        DB_OPEN_READ)
    # The delta database must not be modified during this pass, so
    # turn its deletions into no-ops:
    self._delta_db.__delitem__ = lambda id: None
    self._tree_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.RCS_TREES_FILTERED_STORE),
        artifact_manager.get_temp_file(config.RCS_TREES_FILTERED_INDEX_TABLE),
        DB_OPEN_READ)
    serializer = MarshalSerializer()
    if self._compress:
      serializer = CompressingSerializer(serializer)
    self._co_db = Database(
        artifact_manager.get_temp_file(config.CVS_CHECKOUT_DB), DB_OPEN_NEW,
        serializer)

    # The set of CVSFile instances whose TextRecords have already been
    # read:
    self._loaded_files = set()

    # Bookkeeping for the TextRecords of files that currently have
    # live revisions:
    self._text_record_db = TextRecordDatabase(self._delta_db, self._co_db)

  def _get_text_record(self, cvs_rev):
    """Return the TextRecord instance for CVS_REV.

    If the TextRecords for CVS_REV.cvs_file haven't been loaded yet,
    do so now."""

    if cvs_rev.cvs_file not in self._loaded_files:
      for text_record in self._tree_db[cvs_rev.cvs_file.id].itervalues():
        self._text_record_db.add(text_record)
      self._loaded_files.add(cvs_rev.cvs_file)

    return self._text_record_db[cvs_rev.id]

  def get_content_stream(self, cvs_rev, suppress_keyword_substitution=False):
    """Check out the text for revision CVS_REV from the repository.

    Return the text wrapped in a readable file object.  If
    SUPPRESS_KEYWORD_SUBSTITUTION is True, any RCS keywords will be
    _un_expanded prior to returning the file content.  Note that $Log$
    never actually generates a log (which makes test 'requires_cvs()'
    fail).

    Revisions may be requested in any order, but if they are not
    requested in dependency order the checkout database will become
    very large.  Revisions may be skipped.  Each revision may be
    requested only once."""

    try:
      text = self._get_text_record(cvs_rev).checkout(self._text_record_db)
    except MalformedDeltaException as e:
      raise FatalError('Malformed RCS delta in %s, revision %s: %s'
                       % (cvs_rev.cvs_file.get_filename(), cvs_rev.rev, e))
    mode = cvs_rev.cvs_file.mode
    if mode != 'b' and mode != 'o':
      # Not binary and not 'o' (no keyword substitution at all):
      if suppress_keyword_substitution or mode == 'k':
        # Collapse expanded keywords back to their bare form:
        text = self._kw_re.sub(r'$\1$', text)
      else:
        # Expand keywords for this revision:
        text = self._kwo_re.sub(_KeywordExpander(cvs_rev), text)

    return cStringIO.StringIO(text)

  def finish(self):
    """Log any leftover revisions and close all databases."""

    self._text_record_db.log_leftovers()

    del self._text_record_db
    self._delta_db.close()
    self._tree_db.close()
    self._co_db.close()
+# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""Data collection classes. + +This module contains the code used to collect data from the CVS +repository. It parses *,v files, recording all useful information +except for the actual file contents (though even the file contents +might be recorded by the RevisionRecorder if one is configured). + +As a *,v file is parsed, the information pertaining to the file is +accumulated in memory, mostly in _RevisionData, _BranchData, and +_TagData objects. When parsing is complete, a final pass is made over +the data to create some final dependency links, collect statistics, +etc., then the _*Data objects are converted into CVSItem objects +(CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are +dumped into databases. + +During the data collection, persistent unique ids are allocated to +many types of objects: CVSFile, Symbol, and CVSItems. CVSItems are a +special case. CVSItem ids are unique across all CVSItem types, and +the ids are carried over from the corresponding data collection +objects: + + _RevisionData -> CVSRevision + + _BranchData -> CVSBranch + + _TagData -> CVSTag + +In a later pass it is possible to convert tags <-> branches. But even +if this occurs, the new branch or tag uses the same id as the old tag +or branch. 
+ +""" + + +import os +import stat +import re + +from cvs2svn_lib import config +from cvs2svn_lib.common import DB_OPEN_NEW +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import warning_prefix +from cvs2svn_lib.common import error_prefix +from cvs2svn_lib.common import IllegalSVNPathError +from cvs2svn_lib.common import verify_svn_filename_legal +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.project import FileInAndOutOfAtticException +from cvs2svn_lib.cvs_file import CVSPath +from cvs2svn_lib.cvs_file import CVSDirectory +from cvs2svn_lib.cvs_file import CVSFile +from cvs2svn_lib.symbol import Symbol +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.cvs_item import CVSRevision +from cvs2svn_lib.cvs_item import CVSBranch +from cvs2svn_lib.cvs_item import CVSTag +from cvs2svn_lib.cvs_item import cvs_revision_type_map +from cvs2svn_lib.cvs_file_items import VendorBranchError +from cvs2svn_lib.cvs_file_items import CVSFileItems +from cvs2svn_lib.key_generator import KeyGenerator +from cvs2svn_lib.cvs_item_database import NewCVSItemStore +from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector +from cvs2svn_lib.metadata_database import MetadataDatabase +from cvs2svn_lib.metadata_database import MetadataLogger + +import cvs2svn_rcsparse + + +# A regular expression defining "valid" revision numbers (used to +# check that symbol definitions are reasonable). +_valid_revision_re = re.compile(r''' + ^ + (?:\d+\.)+ # Digit groups with trailing dots + \d+ # And the last digit group. + $ + ''', re.VERBOSE) + +_branch_revision_re = re.compile(r''' + ^ + ((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot + (?:0\.)? # CVS sticks an extra 0 here; RCS does not + (\d+) # And the last digit group + $ + ''', re.VERBOSE) + + +def rev_tuple(rev): + """Return a tuple of integers corresponding to revision number REV. 
def rev_tuple(rev):
  """Return a tuple of integers corresponding to revision number REV.

  For example, if REV is '1.2.3.4', then return (1,2,3,4)."""

  return tuple(int(part) for part in rev.split('.'))


def is_trunk_revision(rev):
  """Return True iff REV is a trunk revision.

  REV is a revision number corresponding to a specific revision (i.e.,
  not a whole branch)."""

  # Trunk revisions have exactly two components (e.g., '1.5'):
  return rev.count('.') == 1


def is_branch_revision_number(rev):
  """Return True iff REV is a branch revision number.

  REV is a CVS revision number in canonical form (i.e., with zeros
  removed).  Return True iff it refers to a whole branch, as opposed
  to a single revision."""

  # A whole branch has an odd number of components, hence an even
  # number of dots (e.g., '1.5.2'):
  return rev.count('.') % 2 == 0


def is_same_line_of_development(rev1, rev2):
  """Return True iff REV1 and REV2 are on the same line of development.

  That is, return True iff both are on trunk, or both are on the same
  branch; return False otherwise.  Either REV1 or REV2 can be None, in
  which case automatically return False."""

  if rev1 is None or rev2 is None:
    return False
  if is_trunk_revision(rev1) and is_trunk_revision(rev2):
    return True
  # Two revisions share a line of development iff they have the same
  # prefix up to (but not including) the last dot:
  return rev1[:rev1.rfind('.')] == rev2[:rev2.rfind('.')]
+ self.parent_branch_data = None + + # The revision number of the parent of this revision along the + # same line of development, if any. For the first revision R on a + # branch, we consider the revision from which R sprouted to be the + # 'parent'. If this is the root revision in the file's revision + # tree, then this field is None. + # + # Note that this revision can't be determined arithmetically (due + # to cvsadmin -o), which is why this field is necessary. + self.parent = None + + # The revision number of the primary child of this revision (the + # child along the same line of development), if any; otherwise, + # None. + self.child = None + + # The _BranchData instances of branches that sprout from this + # revision, sorted in ascending order by branch number. It would + # be inconvenient to initialize it here because we would have to + # scan through all branches known by the _SymbolDataCollector to + # find the ones having us as the parent. Instead, this + # information is filled in by + # _FileDataCollector._resolve_dependencies() and sorted by + # _FileDataCollector._sort_branches(). + self.branches_data = [] + + # The revision numbers of the first commits on any branches on + # which commits occurred. This dependency is kept explicitly + # because otherwise a revision-only topological sort would miss + # the dependency that exists via branches_data. + self.branches_revs_data = [] + + # The _TagData instances of tags that are connected to this + # revision. + self.tags_data = [] + + # A token that may be returned from + # RevisionRecorder.record_text(). It can be used by + # RevisionReader to obtain the text again. + self.revision_recorder_token = None + + def get_first_on_branch_id(self): + return self.parent_branch_data and self.parent_branch_data.id + + +class _SymbolData: + """Collection area for information about a symbol in a single CVSFile. 
+ + SYMBOL is an instance of Symbol, undifferentiated as a Branch or a + Tag regardless of whether self is a _BranchData or a _TagData.""" + + def __init__(self, id, symbol): + """Initialize an object for SYMBOL.""" + + # The unique id that will be used for this particular symbol in + # this particular file. This same id will be used for the CVSItem + # that is derived from this instance. + self.id = id + + # An instance of Symbol. + self.symbol = symbol + + +class _BranchData(_SymbolData): + """Collection area for information about a Branch in a single CVSFile.""" + + def __init__(self, id, symbol, branch_number): + _SymbolData.__init__(self, id, symbol) + + # The branch number (e.g., '1.5.2') of this branch. + self.branch_number = branch_number + + # The revision number of the revision from which this branch + # sprouts (e.g., '1.5'). + self.parent = self.branch_number[:self.branch_number.rindex(".")] + + # The revision number of the first commit on this branch, if any + # (e.g., '1.5.2.1'); otherwise, None. + self.child = None + + +class _TagData(_SymbolData): + """Collection area for information about a Tag in a single CVSFile.""" + + def __init__(self, id, symbol, rev): + _SymbolData.__init__(self, id, symbol) + + # The revision number being tagged (e.g., '1.5.2.3'). + self.rev = rev + + +class _SymbolDataCollector(object): + """Collect information about symbols in a single CVSFile.""" + + def __init__(self, fdc, cvs_file): + self.fdc = fdc + self.cvs_file = cvs_file + + self.pdc = self.fdc.pdc + self.collect_data = self.fdc.collect_data + + # A list [(name, revision), ...] of symbols defined in the header + # of the file. The name has already been transformed using the + # symbol transform rules. If the symbol transform rules indicate + # that the symbol should be ignored, then it is never added to + # this list. This list is processed then deleted in + # process_symbols(). 
+ self._symbol_defs = [] + + # A set containing the transformed names of symbols in this file + # (used to detect duplicats during processing of unlabeled + # branches): + self._defined_symbols = set() + + # Map { branch_number : _BranchData }, where branch_number has an + # odd number of digits. + self.branches_data = { } + + # Map { revision : [ tag_data ] }, where revision has an even + # number of digits, and the value is a list of _TagData objects + # for tags that apply to that revision. + self.tags_data = { } + + def _add_branch(self, name, branch_number): + """Record that BRANCH_NUMBER is the branch number for branch NAME, + and derive and record the revision from which NAME sprouts. + BRANCH_NUMBER is an RCS branch number with an odd number of + components, for example '1.7.2' (never '1.7.0.2'). Return the + _BranchData instance (which is usually newly-created).""" + + branch_data = self.branches_data.get(branch_number) + + if branch_data is not None: + Log().warn( + "%s: in '%s':\n" + " branch '%s' already has name '%s',\n" + " cannot also have name '%s', ignoring the latter\n" + % (warning_prefix, + self.cvs_file.filename, branch_number, + branch_data.symbol.name, name) + ) + return branch_data + + symbol = self.pdc.get_symbol(name) + branch_data = _BranchData( + self.collect_data.item_key_generator.gen_id(), symbol, branch_number + ) + self.branches_data[branch_number] = branch_data + return branch_data + + def _construct_distinct_name(self, name, original_name): + """Construct a distinct symbol name from NAME. + + If NAME is distinct, return it. 
If it is already used in this + file (as determined from its presence in self._defined_symbols), + construct and return a new name that is not already used.""" + + if name not in self._defined_symbols: + return name + else: + index = 1 + while True: + dup_name = '%s-DUPLICATE-%d' % (name, index,) + if dup_name not in self._defined_symbols: + self.collect_data.record_fatal_error( + "Symbol name '%s' is already used in '%s'.\n" + "The unlabeled branch '%s' must be renamed using " + "--symbol-transform." + % (name, self.cvs_file.filename, original_name,) + ) + return dup_name + + def _add_unlabeled_branch(self, branch_number): + original_name = "unlabeled-" + branch_number + name = self.transform_symbol(original_name, branch_number) + if name is None: + self.collect_data.record_fatal_error( + "The unlabeled branch '%s' in '%s' contains commits.\n" + "It may not be ignored via a symbol transform. (Use --exclude " + "instead.)" + % (original_name, self.cvs_file.filename,) + ) + # Retain the original name to allow the conversion to continue: + name = original_name + + distinct_name = self._construct_distinct_name(name, original_name) + self._defined_symbols.add(distinct_name) + return self._add_branch(distinct_name, branch_number) + + def _add_tag(self, name, revision): + """Record that tag NAME refers to the specified REVISION.""" + + symbol = self.pdc.get_symbol(name) + tag_data = _TagData( + self.collect_data.item_key_generator.gen_id(), symbol, revision + ) + self.tags_data.setdefault(revision, []).append(tag_data) + return tag_data + + def transform_symbol(self, name, revision): + """Transform a symbol according to the project's symbol transforms. + + Transform the symbol with the original name NAME and canonicalized + revision number REVISION. Return the new symbol name or None if + the symbol should be ignored entirely. 
+ + Log the results of the symbol transform if necessary.""" + + old_name = name + # Apply any user-defined symbol transforms to the symbol name: + name = self.cvs_file.project.transform_symbol( + self.cvs_file, name, revision + ) + + if name is None: + # Ignore symbol: + self.pdc.log_symbol_transform(old_name, None) + Log().verbose( + " symbol '%s'=%s ignored in %s" + % (old_name, revision, self.cvs_file.filename,) + ) + else: + if name != old_name: + self.pdc.log_symbol_transform(old_name, name) + Log().verbose( + " symbol '%s'=%s transformed to '%s' in %s" + % (old_name, revision, name, self.cvs_file.filename,) + ) + + return name + + def define_symbol(self, name, revision): + """Record a symbol definition for later processing.""" + + # Canonicalize the revision number: + revision = _branch_revision_re.sub(r'\1\2', revision) + + # Apply any user-defined symbol transforms to the symbol name: + name = self.transform_symbol(name, revision) + + if name is not None: + # Verify that the revision number is valid: + if _valid_revision_re.match(revision): + # The revision number is valid; record it for later processing: + self._symbol_defs.append( (name, revision) ) + else: + Log().warn( + 'In %r:\n' + ' branch %r references invalid revision %s\n' + ' and will be ignored.' + % (self.cvs_file.filename, name, revision,) + ) + + def _eliminate_trivial_duplicate_defs(self, symbol_defs): + """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions. + + Duplicate definitions of symbol names have been seen in the wild, + and they can also happen when --symbol-transform is used. If a + symbol is defined to the same revision number repeatedly, then + ignore all but the last definition.""" + + # Make a copy, since we have to iterate through the definitions + # twice: + symbol_defs = list(symbol_defs) + + # A map { (name, revision) : [index,...] 
} of the indexes where + # symbol definitions name=revision were found: + known_definitions = {} + for (i, symbol_def) in enumerate(symbol_defs): + known_definitions.setdefault(symbol_def, []).append(i) + + # A set of the indexes of entries that have to be removed from + # symbol_defs: + dup_indexes = set() + for ((name, revision), indexes) in known_definitions.iteritems(): + if len(indexes) > 1: + Log().verbose( + "in %r:\n" + " symbol %s:%s defined multiple times; ignoring duplicates\n" + % (self.cvs_file.filename, name, revision,) + ) + dup_indexes.update(indexes[:-1]) + + for (i, symbol_def) in enumerate(symbol_defs): + if i not in dup_indexes: + yield symbol_def + + def _process_duplicate_defs(self, symbol_defs): + """Iterate through SYMBOL_DEFS, processing duplicate names. + + Duplicate definitions of symbol names have been seen in the wild, + and they can also happen when --symbol-transform is used. If a + symbol is defined multiple times, then it is a fatal error. This + method should be called after _eliminate_trivial_duplicate_defs().""" + + # Make a copy, since we have to access multiple times: + symbol_defs = list(symbol_defs) + + # A map {name : [index,...]} mapping the names of symbols to a + # list of their definitions' indexes in symbol_defs: + known_symbols = {} + for (i, (name, revision)) in enumerate(symbol_defs): + known_symbols.setdefault(name, []).append(i) + + known_symbols = known_symbols.items() + known_symbols.sort() + dup_indexes = set() + for (name, indexes) in known_symbols: + if len(indexes) > 1: + # This symbol was defined multiple times. 
+        self.collect_data.record_fatal_error(
+            "Multiple definitions of the symbol '%s' in '%s': %s" % (
+                name, self.cvs_file.filename,
+                ' '.join([symbol_defs[i][1] for i in indexes]),
+                )
+            )
+        # Ignore all but the last definition for now, to allow the
+        # conversion to proceed:
+        dup_indexes.update(indexes[:-1])
+
+    for (i, symbol_def) in enumerate(symbol_defs):
+      if i not in dup_indexes:
+        yield symbol_def
+
+  def _process_symbol(self, name, revision):
+    """Process a symbol called NAME, which is associated with REVISION.
+
+    REVISION is a canonical revision number with zeros removed, for
+    example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
+    transformed branch or tag name."""
+
+    # Add symbol to our records:
+    if is_branch_revision_number(revision):
+      self._add_branch(name, revision)
+    else:
+      self._add_tag(name, revision)
+
+  def process_symbols(self):
+    """Process the symbol definitions from SELF._symbol_defs."""
+
+    symbol_defs = self._symbol_defs
+    del self._symbol_defs
+
+    symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
+    symbol_defs = self._process_duplicate_defs(symbol_defs)
+
+    for (name, revision) in symbol_defs:
+      self._defined_symbols.add(name)
+      self._process_symbol(name, revision)
+
+  @staticmethod
+  def rev_to_branch_number(revision):
+    """Return the branch_number of the branch on which REVISION lies.
+
+    REVISION is a branch revision number with an even number of
+    components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
+    The return value is the branch number (for example, '1.7.2').
+    Return None iff REVISION is a trunk revision such as '1.2'."""
+
+    if is_trunk_revision(revision):
+      return None
+    return revision[:revision.rindex(".")]
+
+  def rev_to_branch_data(self, revision):
+    """Return the branch_data of the branch on which REVISION lies.
+
+    REVISION must be a branch revision number with an even number of
+    components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
+ Raise KeyError iff REVISION is unknown.""" + + assert not is_trunk_revision(revision) + + return self.branches_data[self.rev_to_branch_number(revision)] + + def rev_to_lod(self, revision): + """Return the line of development on which REVISION lies. + + REVISION must be a revision number with an even number of + components. Raise KeyError iff REVISION is unknown.""" + + if is_trunk_revision(revision): + return self.pdc.trunk + else: + return self.rev_to_branch_data(revision).symbol + + +class _FileDataCollector(cvs2svn_rcsparse.Sink): + """Class responsible for collecting RCS data for a particular file. + + Any collected data that need to be remembered are stored into the + referenced CollectData instance.""" + + def __init__(self, pdc, cvs_file): + """Create an object that is prepared to receive data for CVS_FILE. + CVS_FILE is a CVSFile instance. COLLECT_DATA is used to store the + information collected about the file.""" + + self.pdc = pdc + self.cvs_file = cvs_file + + self.collect_data = self.pdc.collect_data + self.project = self.cvs_file.project + + # A place to store information about the symbols in this file: + self.sdc = _SymbolDataCollector(self, self.cvs_file) + + # { revision : _RevisionData instance } + self._rev_data = { } + + # Lists [ (parent, child) ] of revision number pairs indicating + # that revision child depends on revision parent along the main + # line of development. + self._primary_dependencies = [] + + # If set, this is an RCS branch number -- rcsparse calls this the + # "principal branch", but CVS and RCS refer to it as the "default + # branch", so that's what we call it, even though the rcsparse API + # setter method is still 'set_principal_branch'. + self.default_branch = None + + # True iff revision 1.1 of the file appears to have been imported + # (as opposed to added normally). 
+ self._file_imported = False + + def _get_rev_id(self, revision): + if revision is None: + return None + return self._rev_data[revision].cvs_rev_id + + def set_principal_branch(self, branch): + """This is a callback method declared in Sink.""" + + if branch.find('.') == -1: + # This just sets the default branch to trunk. Normally this + # shouldn't occur, but it has been seen in at least one CVS + # repository. Just ignore it. + pass + else: + self.default_branch = branch + + def set_expansion(self, mode): + """This is a callback method declared in Sink.""" + + self.cvs_file.mode = mode + + def define_tag(self, name, revision): + """Remember the symbol name and revision, but don't process them yet. + + This is a callback method declared in Sink.""" + + self.sdc.define_symbol(name, revision) + + def admin_completed(self): + """This is a callback method declared in Sink.""" + + self.sdc.process_symbols() + + def define_revision(self, revision, timestamp, author, state, + branches, next): + """This is a callback method declared in Sink.""" + + for branch in branches: + try: + branch_data = self.sdc.rev_to_branch_data(branch) + except KeyError: + # Normally we learn about the branches from the branch names + # and numbers parsed from the symbolic name header. But this + # must have been an unlabeled branch that slipped through the + # net. Generate a name for it and create a _BranchData record + # for it now. + branch_data = self.sdc._add_unlabeled_branch( + self.sdc.rev_to_branch_number(branch)) + + assert branch_data.child is None + branch_data.child = branch + + if revision in self._rev_data: + # This revision has already been seen. + Log().error('File %r contains duplicate definitions of revision %s.' 
+                  % (self.cvs_file.filename, revision,))
+      raise RuntimeError
+
+    # Record basic information about the revision:
+    rev_data = _RevisionData(
+        self.collect_data.item_key_generator.gen_id(),
+        revision, int(timestamp), author, state)
+    self._rev_data[revision] = rev_data
+
+    # When on trunk, the RCS 'next' revision number points to what
+    # humans might consider to be the 'previous' revision number.  For
+    # example, 1.3's RCS 'next' is 1.2.
+    #
+    # However, on a branch, the RCS 'next' revision number really does
+    # point to what humans would consider to be the 'next' revision
+    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
+    #
+    # In other words, in RCS, 'next' always means "where to find the next
+    # deltatext that you need this revision to retrieve."
+    #
+    # That said, we don't *want* RCS's behavior here, so we determine
+    # whether we're on trunk or a branch and set the dependencies
+    # accordingly.
+    if next:
+      if is_trunk_revision(revision):
+        self._primary_dependencies.append( (next, revision,) )
+      else:
+        self._primary_dependencies.append( (revision, next,) )
+
+  def _resolve_primary_dependencies(self):
+    """Resolve the dependencies listed in self._primary_dependencies."""
+
+    for (parent, child,) in self._primary_dependencies:
+      parent_data = self._rev_data[parent]
+      assert parent_data.child is None
+      parent_data.child = child
+
+      child_data = self._rev_data[child]
+      assert child_data.parent is None
+      child_data.parent = parent
+
+  def _resolve_branch_dependencies(self):
+    """Resolve dependencies involving branches."""
+
+    for branch_data in self.sdc.branches_data.values():
+      # The branch_data's parent has the branch as a child regardless
+      # of whether the branch had any subsequent commits:
+      try:
+        parent_data = self._rev_data[branch_data.parent]
+      except KeyError:
+        Log().warn(
+            'In %r:\n'
+            '    branch %r references non-existing revision %s\n'
+            '    and will be ignored.'
+ % (self.cvs_file.filename, branch_data.symbol.name, + branch_data.parent,)) + del self.sdc.branches_data[branch_data.branch_number] + else: + parent_data.branches_data.append(branch_data) + + # If the branch has a child (i.e., something was committed on + # the branch), then we store a reference to the branch_data + # there, define the child's parent to be the branch's parent, + # and list the child in the branch parent's branches_revs_data: + if branch_data.child is not None: + child_data = self._rev_data[branch_data.child] + assert child_data.parent_branch_data is None + child_data.parent_branch_data = branch_data + assert child_data.parent is None + child_data.parent = branch_data.parent + parent_data.branches_revs_data.append(branch_data.child) + + def _sort_branches(self): + """Sort the branches sprouting from each revision in creation order. + + Creation order is taken to be the reverse of the order that they + are listed in the symbols part of the RCS file. (If a branch is + created then deleted, a later branch can be assigned the recycled + branch number; therefore branch numbers are not an indication of + creation order.)""" + + for rev_data in self._rev_data.values(): + rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id)) + + def _resolve_tag_dependencies(self): + """Resolve dependencies involving tags.""" + + for (rev, tag_data_list) in self.sdc.tags_data.items(): + try: + parent_data = self._rev_data[rev] + except KeyError: + Log().warn( + 'In %r:\n' + ' the following tag(s) reference non-existing revision %s\n' + ' and will be ignored:\n' + ' %s' % ( + self.cvs_file.filename, rev, + ', '.join([repr(tag_data.symbol.name) + for tag_data in tag_data_list]),)) + del self.sdc.tags_data[rev] + else: + for tag_data in tag_data_list: + assert tag_data.rev == rev + # The tag_data's rev has the tag as a child: + parent_data.tags_data.append(tag_data) + + def _determine_operation(self, rev_data): + prev_rev_data = self._rev_data.get(rev_data.parent) + 
return cvs_revision_type_map[( + rev_data.state != 'dead', + prev_rev_data is not None and prev_rev_data.state != 'dead', + )] + + def _get_cvs_revision(self, rev_data): + """Create and return a CVSRevision for REV_DATA.""" + + branch_ids = [ + branch_data.id + for branch_data in rev_data.branches_data + ] + + branch_commit_ids = [ + self._get_rev_id(rev) + for rev in rev_data.branches_revs_data + ] + + tag_ids = [ + tag_data.id + for tag_data in rev_data.tags_data + ] + + revision_type = self._determine_operation(rev_data) + + return revision_type( + self._get_rev_id(rev_data.rev), self.cvs_file, + rev_data.timestamp, None, + self._get_rev_id(rev_data.parent), + self._get_rev_id(rev_data.child), + rev_data.rev, + True, + self.sdc.rev_to_lod(rev_data.rev), + rev_data.get_first_on_branch_id(), + False, None, None, + tag_ids, branch_ids, branch_commit_ids, + rev_data.revision_recorder_token) + + def _get_cvs_revisions(self): + """Generate the CVSRevisions present in this file.""" + + for rev_data in self._rev_data.itervalues(): + yield self._get_cvs_revision(rev_data) + + def _get_cvs_branches(self): + """Generate the CVSBranches present in this file.""" + + for branch_data in self.sdc.branches_data.values(): + yield CVSBranch( + branch_data.id, self.cvs_file, branch_data.symbol, + branch_data.branch_number, + self.sdc.rev_to_lod(branch_data.parent), + self._get_rev_id(branch_data.parent), + self._get_rev_id(branch_data.child), + None, + ) + + def _get_cvs_tags(self): + """Generate the CVSTags present in this file.""" + + for tags_data in self.sdc.tags_data.values(): + for tag_data in tags_data: + yield CVSTag( + tag_data.id, self.cvs_file, tag_data.symbol, + self.sdc.rev_to_lod(tag_data.rev), + self._get_rev_id(tag_data.rev), + None, + ) + + def tree_completed(self): + """The revision tree has been parsed. + + Analyze it for consistency and connect some loose ends. 
+ + This is a callback method declared in Sink.""" + + self._resolve_primary_dependencies() + self._resolve_branch_dependencies() + self._sort_branches() + self._resolve_tag_dependencies() + + # Compute the preliminary CVSFileItems for this file: + cvs_items = [] + cvs_items.extend(self._get_cvs_revisions()) + cvs_items.extend(self._get_cvs_branches()) + cvs_items.extend(self._get_cvs_tags()) + self._cvs_file_items = CVSFileItems( + self.cvs_file, self.pdc.trunk, cvs_items + ) + + self._cvs_file_items.check_link_consistency() + + # Tell the revision recorder about the file dependency tree. + self.collect_data.revision_recorder.start_file(self._cvs_file_items) + + def set_revision_info(self, revision, log, text): + """This is a callback method declared in Sink.""" + + rev_data = self._rev_data[revision] + cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id] + + if cvs_rev.metadata_id is not None: + # Users have reported problems with repositories in which the + # deltatext block for revision 1.1 appears twice. It is not + # known whether this results from a CVS/RCS bug, or from botched + # hand-editing of the repository. In any case, empirically, cvs + # and rcs both use the first version when checking out data, so + # that's what we will do. (For the record: "cvs log" fails on + # such a file; "rlog" prints the log message from the first + # block and ignores the second one.) 
+ Log().warn( + "%s: in '%s':\n" + " Deltatext block for revision %s appeared twice;\n" + " ignoring the second occurrence.\n" + % (warning_prefix, self.cvs_file.filename, revision,) + ) + return + + if is_trunk_revision(revision): + branch_name = None + else: + branch_name = self.sdc.rev_to_branch_data(revision).symbol.name + + cvs_rev.metadata_id = self.collect_data.metadata_logger.store( + self.project, branch_name, rev_data.author, log + ) + cvs_rev.deltatext_exists = bool(text) + + # If this is revision 1.1, determine whether the file appears to + # have been created via 'cvs add' instead of 'cvs import'. The + # test is that the log message CVS uses for 1.1 in imports is + # "Initial revision\n" with no period. (This fact helps determine + # whether this file might have had a default branch in the past.) + if revision == '1.1': + self._file_imported = (log == 'Initial revision\n') + + cvs_rev.revision_recorder_token = \ + self.collect_data.revision_recorder.record_text(cvs_rev, log, text) + + def parse_completed(self): + """Finish the processing of this file. + + This is a callback method declared in Sink.""" + + # Make sure that there was an info section for each revision: + for cvs_item in self._cvs_file_items.values(): + if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None: + self.collect_data.record_fatal_error( + '%r has no deltatext section for revision %s' + % (self.cvs_file.filename, cvs_item.rev,) + ) + + def _process_ntdbrs(self): + """Fix up any non-trunk default branch revisions (if present). + + If a non-trunk default branch is determined to have existed, yield + the _RevisionData.ids for all revisions that were once non-trunk + default revisions, in dependency order. + + There are two cases to handle: + + One case is simple. The RCS file lists a default branch + explicitly in its header, such as '1.1.1'. In this case, we know + that every revision on the vendor branch is to be treated as head + of trunk at that point in time. 
+ + But there's also a degenerate case. The RCS file does not + currently have a default branch, yet we can deduce that for some + period in the past it probably *did* have one. For example, the + file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are + dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated + after 1.2. In this case, we should record 1.1.1.96 as the last + vendor revision to have been the head of the default branch. + + If any non-trunk default branch revisions are found: + + - Set their ntdbr members to True. + + - Connect the last one with revision 1.2. + + - Remove revision 1.1 if it is not needed. + + """ + + try: + if self.default_branch: + vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id + vendor_lod_items = self._cvs_file_items.get_lod_items( + self._cvs_file_items[vendor_cvs_branch_id] + ) + if not self._cvs_file_items.process_live_ntdb(vendor_lod_items): + return + elif self._file_imported: + vendor_branch_data = self.sdc.branches_data.get('1.1.1') + if vendor_branch_data is None: + return + else: + vendor_lod_items = self._cvs_file_items.get_lod_items( + self._cvs_file_items[vendor_branch_data.id] + ) + if not self._cvs_file_items.process_historical_ntdb( + vendor_lod_items + ): + return + else: + return + except VendorBranchError, e: + self.collect_data.record_fatal_error(str(e)) + return + + if self._file_imported: + self._cvs_file_items.imported_remove_1_1(vendor_lod_items) + + self._cvs_file_items.check_link_consistency() + + def get_cvs_file_items(self): + """Finish up and return a CVSFileItems instance for this file. + + This method must only be called once.""" + + self._process_ntdbrs() + + # Break a circular reference loop, allowing the memory for self + # and sdc to be freed. 
+    del self.sdc
+
+    return self._cvs_file_items
+
+
+class _ProjectDataCollector:
+  def __init__(self, collect_data, project):
+    self.collect_data = collect_data
+    self.project = project
+    self.num_files = 0
+
+    # The Trunk LineOfDevelopment object for this project:
+    self.trunk = Trunk(
+        self.collect_data.symbol_key_generator.gen_id(), self.project
+        )
+    self.project.trunk_id = self.trunk.id
+
+    # This causes a record for self.trunk to spring into existence:
+    self.collect_data.symbol_stats[self.trunk]
+
+    # A map { name -> Symbol } for all known symbols in this project.
+    # The symbols listed here are undifferentiated into Branches and
+    # Tags because the same name might appear as a branch in one file
+    # and a tag in another.
+    self.symbols = {}
+
+    # A map { (old_name, new_name) : count } indicating how many files
+    # were affected by each symbol name transformation:
+    self.symbol_transform_counts = {}
+
+  def get_symbol(self, name):
+    """Return the Symbol object for the symbol named NAME in this project.
+
+    If such a symbol does not yet exist, allocate a new symbol_id,
+    create a Symbol instance, store it in self.symbols, and return it."""
+
+    symbol = self.symbols.get(name)
+    if symbol is None:
+      symbol = Symbol(
+          self.collect_data.symbol_key_generator.gen_id(),
+          self.project, name)
+      self.symbols[name] = symbol
+    return symbol
+
+  def log_symbol_transform(self, old_name, new_name):
+    """Record that OLD_NAME was transformed to NEW_NAME in one file.
+ + This information is used to generated a statistical summary of + symbol transforms.""" + + try: + self.symbol_transform_counts[old_name, new_name] += 1 + except KeyError: + self.symbol_transform_counts[old_name, new_name] = 1 + + def summarize_symbol_transforms(self): + if self.symbol_transform_counts and Log().is_on(Log.NORMAL): + log = Log() + log.normal('Summary of symbol transforms:') + transforms = self.symbol_transform_counts.items() + transforms.sort() + for ((old_name, new_name), count) in transforms: + if new_name is None: + log.normal(' "%s" ignored in %d files' % (old_name, count,)) + else: + log.normal( + ' "%s" transformed to "%s" in %d files' + % (old_name, new_name, count,) + ) + + def _process_cvs_file_items(self, cvs_file_items): + """Process the CVSFileItems from one CVSFile.""" + + # Remove CVSRevisionDeletes that are not needed: + cvs_file_items.remove_unneeded_deletes(self.collect_data.metadata_db) + + # Remove initial branch deletes that are not needed: + cvs_file_items.remove_initial_branch_deletes( + self.collect_data.metadata_db + ) + + # If this is a --trunk-only conversion, discard all branches and + # tags, then draft any non-trunk default branch revisions to + # trunk: + if Ctx().trunk_only: + cvs_file_items.exclude_non_trunk() + + self.collect_data.revision_recorder.finish_file(cvs_file_items) + self.collect_data.add_cvs_file_items(cvs_file_items) + self.collect_data.symbol_stats.register(cvs_file_items) + + def process_file(self, cvs_file): + Log().normal(cvs_file.filename) + fdc = _FileDataCollector(self, cvs_file) + try: + cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc) + except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError): + self.collect_data.record_fatal_error( + "%r is not a valid ,v file" % (cvs_file.filename,) + ) + # Abort the processing of this file, but let the pass continue + # with other files: + return + except: + Log().warn("Exception occurred while parsing %s" % cvs_file.filename) + 
raise + else: + self.num_files += 1 + + cvs_file_items = fdc.get_cvs_file_items() + + del fdc + + self._process_cvs_file_items(cvs_file_items) + + +class CollectData: + """Repository for data collected by parsing the CVS repository files. + + This class manages the databases into which information collected + from the CVS repository is stored. The data are stored into this + class by _FileDataCollector instances, one of which is created for + each file to be parsed.""" + + def __init__(self, revision_recorder, stats_keeper): + self.revision_recorder = revision_recorder + self._cvs_item_store = NewCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_STORE)) + self.metadata_db = MetadataDatabase( + artifact_manager.get_temp_file(config.METADATA_STORE), + artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE), + DB_OPEN_NEW, + ) + self.metadata_logger = MetadataLogger(self.metadata_db) + self.fatal_errors = [] + self.num_files = 0 + self.symbol_stats = SymbolStatisticsCollector() + self.stats_keeper = stats_keeper + + # Key generator for CVSFiles: + self.file_key_generator = KeyGenerator() + + # Key generator for CVSItems: + self.item_key_generator = KeyGenerator() + + # Key generator for Symbols: + self.symbol_key_generator = KeyGenerator() + + self.revision_recorder.start() + + def record_fatal_error(self, err): + """Record that fatal error ERR was found. + + ERR is a string (without trailing newline) describing the error. + Output the error to stderr immediately, and record a copy to be + output again in a summary at the end of CollectRevsPass.""" + + err = '%s: %s' % (error_prefix, err,) + Log().error(err + '\n') + self.fatal_errors.append(err) + + def add_cvs_directory(self, cvs_directory): + """Record CVS_DIRECTORY.""" + + Ctx()._cvs_file_db.log_file(cvs_directory) + + def add_cvs_file_items(self, cvs_file_items): + """Record the information from CVS_FILE_ITEMS. 
+ + Store the CVSFile to _cvs_file_db under its persistent id, store + the CVSItems, and record the CVSItems to self.stats_keeper.""" + + Ctx()._cvs_file_db.log_file(cvs_file_items.cvs_file) + self._cvs_item_store.add(cvs_file_items) + + self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file) + for cvs_item in cvs_file_items.values(): + self.stats_keeper.record_cvs_item(cvs_item) + + def _get_cvs_file( + self, parent_directory, basename, file_in_attic, leave_in_attic=False + ): + """Return a CVSFile describing the file with name BASENAME. + + PARENT_DIRECTORY is the CVSDirectory instance describing the + directory that physically holds this file in the filesystem. + BASENAME must be the base name of a *,v file within + PARENT_DIRECTORY. + + FILE_IN_ATTIC is a boolean telling whether the specified file is + in an Attic subdirectory. If FILE_IN_ATTIC is True, then: + + - If LEAVE_IN_ATTIC is True, then leave the 'Attic' component in + the filename. + + - Otherwise, raise FileInAndOutOfAtticException if a file with the + same filename appears outside of Attic. + + The CVSFile is assigned a new unique id. All of the CVSFile + information is filled in except mode (which can only be determined + by parsing the file). 
+ + Raise FatalError if the resulting filename would not be legal in + SVN.""" + + filename = os.path.join(parent_directory.filename, basename) + try: + verify_svn_filename_legal(basename[:-2]) + except IllegalSVNPathError, e: + raise FatalError( + 'File %r would result in an illegal SVN filename: %s' + % (filename, e,) + ) + + if file_in_attic and not leave_in_attic: + in_attic = True + logical_parent_directory = parent_directory.parent_directory + + # If this file also exists outside of the attic, it's a fatal + # error: + non_attic_filename = os.path.join( + logical_parent_directory.filename, basename, + ) + if os.path.exists(non_attic_filename): + raise FileInAndOutOfAtticException(non_attic_filename, filename) + else: + in_attic = False + logical_parent_directory = parent_directory + + file_stat = os.stat(filename) + + # The size of the file in bytes: + file_size = file_stat[stat.ST_SIZE] + + # Whether or not the executable bit is set: + file_executable = bool(file_stat[0] & stat.S_IXUSR) + + # mode is not known, so we temporarily set it to None. + return CVSFile( + self.file_key_generator.gen_id(), + parent_directory.project, logical_parent_directory, basename[:-2], + in_attic, file_executable, file_size, None + ) + + def _get_attic_file(self, parent_directory, basename): + """Return a CVSFile object for the Attic file at BASENAME. + + PARENT_DIRECTORY is the CVSDirectory that physically contains the + file on the filesystem (i.e., the Attic directory). It is not + necessarily the parent_directory of the CVSFile that will be + returned. 
+ + Return CVSFile, whose parent directory is usually + PARENT_DIRECTORY.parent_directory, but might be PARENT_DIRECTORY + iff CVSFile will remain in the Attic directory.""" + + try: + return self._get_cvs_file(parent_directory, basename, True) + except FileInAndOutOfAtticException, e: + if Ctx().retain_conflicting_attic_files: + Log().warn( + "%s: %s;\n" + " storing the latter into 'Attic' subdirectory.\n" + % (warning_prefix, e) + ) + else: + self.record_fatal_error(str(e)) + + # Either way, return a CVSFile object so that the rest of the + # file processing can proceed: + return self._get_cvs_file( + parent_directory, basename, True, leave_in_attic=True + ) + + def _generate_attic_cvs_files(self, cvs_directory): + """Generate CVSFiles for the files in Attic directory CVS_DIRECTORY. + + Also add CVS_DIRECTORY to self if any files are being retained in + that directory.""" + + retained_attic_file = False + + fnames = os.listdir(cvs_directory.filename) + fnames.sort() + for fname in fnames: + pathname = os.path.join(cvs_directory.filename, fname) + if os.path.isdir(pathname): + Log().warn("Directory %s found within Attic; ignoring" % (pathname,)) + elif fname.endswith(',v'): + cvs_file = self._get_attic_file(cvs_directory, fname) + if cvs_file.parent_directory == cvs_directory: + # This file will be retained in the Attic directory. + retained_attic_file = True + yield cvs_file + + if retained_attic_file: + # If any files were retained in the Attic directory, then write + # the Attic directory to CVSFileDatabase: + self.add_cvs_directory(cvs_directory) + + def _get_non_attic_file(self, parent_directory, basename): + """Return a CVSFile object for the non-Attic file at BASENAME.""" + + return self._get_cvs_file(parent_directory, basename, False) + + def _generate_cvs_files(self, cvs_directory): + """Generate the CVSFiles under non-Attic directory CVS_DIRECTORY. + + Process directories recursively, including Attic directories. 
+ Also create and register CVSDirectories as they are found, and + look for conflicts between the filenames that will result from + files, attic files, and subdirectories.""" + + self.add_cvs_directory(cvs_directory) + + # Map {cvs_file.basename : cvs_file.filename} for files directly + # in cvs_directory: + rcsfiles = {} + + attic_dir = None + + # Non-Attic subdirectories of cvs_directory (to be recursed into): + dirs = [] + + fnames = os.listdir(cvs_directory.filename) + fnames.sort() + for fname in fnames: + pathname = os.path.join(cvs_directory.filename, fname) + if os.path.isdir(pathname): + if fname == 'Attic': + attic_dir = fname + else: + dirs.append(fname) + elif fname.endswith(',v'): + cvs_file = self._get_non_attic_file(cvs_directory, fname) + rcsfiles[cvs_file.basename] = cvs_file.filename + yield cvs_file + else: + # Silently ignore other files: + pass + + # Map {cvs_file.basename : cvs_file.filename} for files in an + # Attic directory within cvs_directory: + attic_rcsfiles = {} + + if attic_dir is not None: + attic_directory = CVSDirectory( + self.file_key_generator.gen_id(), + cvs_directory.project, cvs_directory, 'Attic', + ) + + for cvs_file in self._generate_attic_cvs_files(attic_directory): + if cvs_file.parent_directory == cvs_directory: + attic_rcsfiles[cvs_file.basename] = cvs_file.filename + yield cvs_file + + alldirs = dirs + [attic_dir] + else: + alldirs = dirs + + # Check for conflicts between directory names and the filenames + # that will result from the rcs files (both in this directory and + # in attic). (We recurse into the subdirectories nevertheless, to + # try to detect more problems.) + for fname in alldirs: + pathname = os.path.join(cvs_directory.filename, fname) + for rcsfile_list in [rcsfiles, attic_rcsfiles]: + if fname in rcsfile_list: + self.record_fatal_error( + 'Directory name conflicts with filename. 
Please remove or ' + 'rename one\n' + 'of the following:\n' + ' "%s"\n' + ' "%s"' + % (pathname, rcsfile_list[fname],) + ) + + # Now recurse into the other subdirectories: + for fname in dirs: + dirname = os.path.join(cvs_directory.filename, fname) + + # Verify that the directory name does not contain any illegal + # characters: + try: + verify_svn_filename_legal(fname) + except IllegalSVNPathError, e: + raise FatalError( + 'Directory %r would result in an illegal SVN path name: %s' + % (dirname, e,) + ) + + sub_directory = CVSDirectory( + self.file_key_generator.gen_id(), + cvs_directory.project, cvs_directory, fname, + ) + + for cvs_file in self._generate_cvs_files(sub_directory): + yield cvs_file + + def process_project(self, project): + Ctx()._projects[project.id] = project + + root_cvs_directory = CVSDirectory( + self.file_key_generator.gen_id(), project, None, '' + ) + project.root_cvs_directory_id = root_cvs_directory.id + pdc = _ProjectDataCollector(self, project) + + found_rcs_file = False + for cvs_file in self._generate_cvs_files(root_cvs_directory): + pdc.process_file(cvs_file) + found_rcs_file = True + + if not found_rcs_file: + self.record_fatal_error( + 'No RCS files found under %r!\n' + 'Are you absolutely certain you are pointing cvs2svn\n' + 'at a CVS repository?\n' + % (project.project_cvs_repos_path,) + ) + + pdc.summarize_symbol_transforms() + + self.num_files += pdc.num_files + Log().verbose('Processed', self.num_files, 'files') + + def _set_cvs_path_ordinals(self): + cvs_files = list(Ctx()._cvs_file_db.itervalues()) + cvs_files.sort(CVSPath.slow_compare) + for (i, cvs_file) in enumerate(cvs_files): + cvs_file.ordinal = i + + def close(self): + """Close the data structures associated with this instance. + + Return a list of fatal errors encountered while processing input. 
+ Each list entry is a string describing one fatal error.""" + + self.revision_recorder.finish() + self.symbol_stats.purge_ghost_symbols() + self.symbol_stats.close() + self.symbol_stats = None + self.metadata_logger = None + self.metadata_db.close() + self.metadata_db = None + self._cvs_item_store.close() + self._cvs_item_store = None + self._set_cvs_path_ordinals() + self.revision_recorder = None + retval = self.fatal_errors + self.fatal_errors = None + return retval + + diff --git a/cvs2svn_lib/common.py b/cvs2svn_lib/common.py new file mode 100644 index 0000000..8400907 --- /dev/null +++ b/cvs2svn_lib/common.py @@ -0,0 +1,409 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains common facilities used by cvs2svn.""" + + +import re +import time +import codecs + +from cvs2svn_lib.log import Log + + +# Always use these constants for opening databases. +DB_OPEN_READ = 'r' +DB_OPEN_WRITE = 'w' +DB_OPEN_NEW = 'n' + + +SVN_INVALID_REVNUM = -1 + + +# Warnings and errors start with these strings. They are typically +# followed by a colon and a space, as in "%s: " ==> "WARNING: ". +warning_prefix = "WARNING" +error_prefix = "ERROR" + + +class FatalException(Exception): + """Exception thrown on a non-recoverable error. 
def path_join(*components):
  """Join pathname COMPONENTS into an SVN path, inserting '/' as needed.

  Components that are empty (or otherwise false) are skipped, so the
  result contains no empty segments or doubled slashes."""

  nonempty = [component for component in components if component]
  return '/'.join(nonempty)
# Characters that Subversion cannot represent in filenames (the ASCII
# control characters):
ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')


def verify_svn_filename_legal(filename):
  """Check that FILENAME can be used as a Subversion path component.

  FILENAME is a path component of a CVS path.  Raise
  IllegalSVNPathError if it would choke SVN, i.e. if it is empty, is
  one of the special names '.' or '..', or contains any control
  character.  Return None on success."""

  if filename == '':
    raise IllegalSVNPathError("Empty filename component.")

  if filename in ['.', '..']:
    raise IllegalSVNPathError("Illegal filename component %r." % (filename,))

  match = ctrl_characters_regexp.search(filename)
  if match is not None:
    raise IllegalSVNPathError(
        "Character %r in filename %r is not supported by Subversion."
        % (match.group(), filename,)
        )
class PathRepeatedException(Exception):
  """Raised when the same path appears more than once in a path list."""

  def __init__(self, path, count):
    self.path = path
    self.count = count
    message = 'Path %s is repeated %d times' % (self.path, self.count,)
    Exception.__init__(self, message)


class PathsNestedException(Exception):
  """Raised when one path contains other paths from the same list."""

  def __init__(self, nest, nestlings):
    self.nest = nest
    self.nestlings = nestlings
    message = (
        'Path %s contains the following other paths: %s'
        % (self.nest, ', '.join(self.nestlings),)
        )
    Exception.__init__(self, message)


class PathsNotDisjointException(FatalException):
  """A FatalException that collects multiple disjointness exceptions."""

  def __init__(self, problems):
    self.problems = problems
    details = '\n '.join([str(problem) for problem in self.problems])
    Exception.__init__(
        self,
        'The following paths are not disjoint:\n'
        ' %s\n'
        % (details,)
        )
def format_date(date):
  """Return DATE (seconds since epoch) as an svn-compatible date string.

  Subversion formats dates like '2002-09-29T14:44:59.000000Z'; the
  fractional-second part produced here is always zero."""

  utc = time.gmtime(date)
  return time.strftime("%Y-%m-%dT%H:%M:%S", utc) + ".000000Z"
If it is unknown, raise a + LookupError.""" + + for (name, decoder) in self.decoders: + if name == encoding: + return + else: + self.decoders.append( (encoding, codecs.lookup(encoding)[1]) ) + + def set_fallback_encoding(self, encoding): + """Set the fallback encoding, to be tried in 'replace' mode. + + ENCODING is the name of an encoding. If it is unknown, raise a + LookupError.""" + + if encoding is None: + self.fallback_decoder = None + else: + self.fallback_decoder = (encoding, codecs.lookup(encoding)[1]) + + def __call__(self, s): + """Try to decode string S using our configured source encodings. + + Return the string as a Unicode string. If S is already a unicode + string, do nothing. + + Raise UnicodeError if the string cannot be decoded using any of + the source encodings and no fallback encoding was specified.""" + + if isinstance(s, unicode): + return s + for (name, decoder) in self.decoders: + try: + return decoder(s)[0] + except ValueError: + Log().verbose("Encoding '%s' failed for string %r" % (name, s)) + + if self.fallback_decoder is not None: + (name, decoder) = self.fallback_decoder + return decoder(s, 'replace')[0] + else: + raise UnicodeError + + +class Timestamper: + """Return monotonic timestamps derived from changeset timestamps.""" + + def __init__(self): + # The last timestamp that has been returned: + self.timestamp = 0.0 + + # The maximum timestamp that is considered reasonable: + self.max_timestamp = time.time() + 24.0 * 60.0 * 60.0 + + def get(self, timestamp, change_expected): + """Return a reasonable timestamp derived from TIMESTAMP. + + Push TIMESTAMP into the future if necessary to ensure that it is + at least one second later than every other timestamp that has been + returned by previous calls to this method. + + If CHANGE_EXPECTED is not True, then log a message if the + timestamp has to be changed.""" + + if timestamp > self.max_timestamp: + # If a timestamp is in the future, it is assumed that it is + # bogus. 
Shift it backwards in time to prevent it forcing other + # timestamps to be pushed even further in the future. + + # Note that this is not nearly a complete solution to the bogus + # timestamp problem. A timestamp in the future still affects + # the ordering of changesets, and a changeset having such a + # timestamp will not be committed until all changesets with + # earlier timestamps have been committed, even if other + # changesets with even earlier timestamps depend on this one. + self.timestamp = self.timestamp + 1.0 + if not change_expected: + Log().warn( + 'Timestamp "%s" is in the future; changed to "%s".' + % (time.asctime(time.gmtime(timestamp)), + time.asctime(time.gmtime(self.timestamp)),) + ) + elif timestamp < self.timestamp + 1.0: + self.timestamp = self.timestamp + 1.0 + if not change_expected and Log().is_on(Log.VERBOSE): + Log().verbose( + 'Timestamp "%s" adjusted to "%s" to ensure monotonicity.' + % (time.asctime(time.gmtime(timestamp)), + time.asctime(time.gmtime(self.timestamp)),) + ) + else: + self.timestamp = timestamp + + return self.timestamp + + diff --git a/cvs2svn_lib/config.py b/cvs2svn_lib/config.py new file mode 100644 index 0000000..b313b2c --- /dev/null +++ b/cvs2svn_lib/config.py @@ -0,0 +1,221 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""This module contains various configuration constants used by cvs2svn.""" + + +SVN_KEYWORDS_VALUE = 'Author Date Id Revision' + +# The default names for the trunk/branches/tags directory for each +# project: +DEFAULT_TRUNK_BASE = 'trunk' +DEFAULT_BRANCHES_BASE = 'branches' +DEFAULT_TAGS_BASE = 'tags' + +SVNADMIN_EXECUTABLE = 'svnadmin' +CO_EXECUTABLE = 'co' +CVS_EXECUTABLE = 'cvs' +SORT_EXECUTABLE = 'sort' + +# A pickled list of the projects defined for this conversion. +PROJECTS = 'projects.pck' + +# A file holding the Serializer to be used for +# CVS_REVS_SUMMARY_*_DATAFILE and CVS_SYMBOLS_SYMMARY_*_DATAFILE: +SUMMARY_SERIALIZER = 'summary-serializer.pck' + +# The first file contains enough information about each CVSRevision to +# deduce preliminary Changesets. The second file is a sorted version +# of the first. +CVS_REVS_SUMMARY_DATAFILE = 'revs-summary.txt' +CVS_REVS_SUMMARY_SORTED_DATAFILE = 'revs-summary-s.txt' + +# The first file contains enough information about each CVSSymbol to +# deduce preliminary Changesets. The second file is a sorted version +# of the first. +CVS_SYMBOLS_SUMMARY_DATAFILE = 'symbols-summary.txt' +CVS_SYMBOLS_SUMMARY_SORTED_DATAFILE = 'symbols-summary-s.txt' + +# A mapping from CVSItem id to Changeset id. +CVS_ITEM_TO_CHANGESET = 'cvs-item-to-changeset.dat' + +# A mapping from CVSItem id to Changeset id, after the +# RevisionChangeset loops have been broken. +CVS_ITEM_TO_CHANGESET_REVBROKEN = 'cvs-item-to-changeset-revbroken.dat' + +# A mapping from CVSItem id to Changeset id, after the SymbolChangeset +# loops have been broken. +CVS_ITEM_TO_CHANGESET_SYMBROKEN = 'cvs-item-to-changeset-symbroken.dat' + +# A mapping from CVSItem id to Changeset id, after all Changeset +# loops have been broken. +CVS_ITEM_TO_CHANGESET_ALLBROKEN = 'cvs-item-to-changeset-allbroken.dat' + +# A mapping from id to Changeset. 
+CHANGESETS_INDEX = 'changesets-index.dat' +CHANGESETS_STORE = 'changesets.pck' + +# A mapping from id to Changeset, after the RevisionChangeset loops +# have been broken. +CHANGESETS_REVBROKEN_INDEX = 'changesets-revbroken-index.dat' +CHANGESETS_REVBROKEN_STORE = 'changesets-revbroken.pck' + +# A mapping from id to Changeset, after the RevisionChangesets have +# been sorted and converted into OrderedChangesets. +CHANGESETS_REVSORTED_INDEX = 'changesets-revsorted-index.dat' +CHANGESETS_REVSORTED_STORE = 'changesets-revsorted.pck' + +# A mapping from id to Changeset, after the SymbolChangeset loops have +# been broken. +CHANGESETS_SYMBROKEN_INDEX = 'changesets-symbroken-index.dat' +CHANGESETS_SYMBROKEN_STORE = 'changesets-symbroken.pck' + +# A mapping from id to Changeset, after all Changeset loops have been +# broken. +CHANGESETS_ALLBROKEN_INDEX = 'changesets-allbroken-index.dat' +CHANGESETS_ALLBROKEN_STORE = 'changesets-allbroken.pck' + +# The RevisionChangesets in commit order. Each line contains the +# changeset id and timestamp of one changeset, in hexadecimal, in the +# order that the changesets should be committed to svn. +CHANGESETS_SORTED_DATAFILE = 'changesets-s.txt' + +# A file containing a marshalled copy of all the statistics that have +# been gathered so far is written at the end of each pass as a +# marshalled dictionary. This is the pattern used to generate the +# filenames. +STATISTICS_FILE = 'statistics-%02d.pck' + +# This text file contains records (1 per line) that describe openings +# and closings for copies to tags and branches. The format is as +# follows: +# +# SYMBOL_ID SVN_REVNUM TYPE CVS_SYMBOL_ID +# +# where type is either OPENING or CLOSING. CVS_SYMBOL_ID is the id of +# the CVSSymbol whose opening or closing is being described (in hex). +SYMBOL_OPENINGS_CLOSINGS = 'symbolic-names.txt' +# A sorted version of the above file. SYMBOL_ID and SVN_REVNUM are +# the primary and secondary sorting criteria. 
It is important that +# SYMBOL_IDs be located together to make it quick to read them at +# once. The order of SVN_REVNUM is only important because it is +# assumed by some internal consistency checks. +SYMBOL_OPENINGS_CLOSINGS_SORTED = 'symbolic-names-s.txt' + +# Skeleton version of the repository filesystem. See class +# RepositoryMirror for how these work. +MIRROR_NODES_INDEX_TABLE = 'mirror-nodes-index.dat' +MIRROR_NODES_STORE = 'mirror-nodes.pck' + +# Offsets pointing to the beginning of each symbol's records in +# SYMBOL_OPENINGS_CLOSINGS_SORTED. This file contains a pickled map +# from symbol_id to file offset. +SYMBOL_OFFSETS_DB = 'symbol-offsets.pck' + +# Pickled map of CVSFile.id to instance. +CVS_FILES_DB = 'cvs-files.pck' + +# A series of records. The first is a pickled serializer. Each +# subsequent record is a serialized list of all CVSItems applying to a +# CVSFile. +CVS_ITEMS_STORE = 'cvs-items.pck' + +# The same as above, but with the CVSItems ordered in groups based on +# their initial changesets. CVSItems will usually be accessed one +# changeset at a time, so this ordering helps disk locality (even +# though some of the changesets will later be broken up). +CVS_ITEMS_SORTED_INDEX_TABLE = 'cvs-items-sorted-index.dat' +CVS_ITEMS_SORTED_STORE = 'cvs-items-sorted.pck' + +# A record of all symbolic names that will be processed in the +# conversion. This file contains a pickled list of TypedSymbol +# objects. +SYMBOL_DB = 'symbols.pck' + +# A pickled list of the statistics for all symbols. Each entry in the +# list is an instance of cvs2svn_lib.symbol_statistics._Stats. +SYMBOL_STATISTICS = 'symbol-statistics.pck' + +# These two databases provide a bidirectional mapping between +# CVSRevision.ids (in hex) and Subversion revision numbers. +# +# The first maps CVSRevision.id to the SVN revision number of which it +# is a part (more than one CVSRevision can map to the same SVN +# revision number). 
+# +# The second maps Subversion revision numbers (as hex strings) to +# pickled SVNCommit instances. +CVS_REVS_TO_SVN_REVNUMS = 'cvs-revs-to-svn-revnums.dat' + +# This database maps Subversion revision numbers to pickled SVNCommit +# instances. +SVN_COMMITS_INDEX_TABLE = 'svn-commits-index.dat' +SVN_COMMITS_STORE = 'svn-commits.pck' + +# How many bytes to read at a time from a pipe. 128 kiB should be +# large enough to be efficient without wasting too much memory. +PIPE_READ_SIZE = 128 * 1024 + +# Records the author and log message for each changeset. The database +# contains a map metadata_id -> (author, logmessage). Each +# CVSRevision that is eligible to be combined into the same SVN commit +# is assigned the same id. Note that the (author, logmessage) pairs +# are not necessarily all distinct; other data are taken into account +# when constructing ids. +METADATA_INDEX_TABLE = 'metadata-index.dat' +METADATA_STORE = 'metadata.pck' + +# The same, after it has been cleaned up for the chosen output option: +METADATA_CLEAN_INDEX_TABLE = 'metadata-clean-index.dat' +METADATA_CLEAN_STORE = 'metadata-clean.pck' + +# The following four databases are used in conjunction with --use-internal-co. + +# Records the RCS deltas for all CVS revisions. The deltas are to be +# applied forward, i.e. those from trunk are reversed wrt RCS. +RCS_DELTAS_INDEX_TABLE = 'rcs-deltas-index.dat' +RCS_DELTAS_STORE = 'rcs-deltas.pck' + +# Records the revision tree of each RCS file. The format is a list of +# list of integers. The outer list holds lines of development, the inner list +# revisions within the LODs, revisions are CVSItem ids. Branches "closer +# to the trunk" appear later. Revisions are sorted by reverse chronological +# order. The last revision of each branch is the revision it sprouts from. +# Revisions that represent deletions at the end of a branch are omitted. 
+RCS_TREES_INDEX_TABLE = 'rcs-trees-index.dat' +RCS_TREES_STORE = 'rcs-trees.pck' + +# Records the revision tree of each RCS file after removing revisions +# belonging to excluded branches. Note that the branch ordering is arbitrary +# in this file. +RCS_TREES_FILTERED_INDEX_TABLE = 'rcs-trees-filtered-index.dat' +RCS_TREES_FILTERED_STORE = 'rcs-trees-filtered.pck' + +# At any given time during OutputPass, holds the full text of each CVS +# revision that was checked out already and still has descendants that will +# be checked out. +CVS_CHECKOUT_DB = 'cvs-checkout.db' + +# End of DBs related to --use-internal-co. + +# If this run will output directly to a Subversion repository, then +# this is the name of the file that each revision will temporarily be +# written to prior to writing it into the repository. +DUMPFILE = 'svn.dump' + +# flush a commit if a 5 minute gap occurs. +COMMIT_THRESHOLD = 5 * 60 + diff --git a/cvs2svn_lib/context.py b/cvs2svn_lib/context.py new file mode 100644 index 0000000..89dc16a --- /dev/null +++ b/cvs2svn_lib/context.py @@ -0,0 +1,93 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""Store the context (options, etc) for a cvs2svn run.""" + + +import os + +from cvs2svn_lib import config +from cvs2svn_lib.common import CVSTextDecoder + + +class Ctx: + """Session state for this run of cvs2svn. For example, run-time + options are stored here. This class is a Borg (see + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531).""" + + __shared_state = { } + + def __init__(self): + self.__dict__ = self.__shared_state + if self.__dict__: + return + # Else, initialize to defaults. + self.set_defaults() + + def set_defaults(self): + """Set all parameters to their default values.""" + + self.output_option = None + self.dry_run = False + self.revision_recorder = None + self.revision_excluder = None + self.revision_reader = None + self.svnadmin_executable = config.SVNADMIN_EXECUTABLE + self.sort_executable = config.SORT_EXECUTABLE + self.trunk_only = False + self.prune = True + self.cvs_author_decoder = CVSTextDecoder(['ascii']) + self.cvs_log_decoder = CVSTextDecoder(['ascii']) + self.cvs_filename_decoder = CVSTextDecoder(['ascii']) + self.decode_apple_single = False + self.symbol_info_filename = None + self.username = None + self.svn_property_setters = [] + self.tmpdir = 'cvs2svn-tmp' + self.skip_cleanup = False + self.keep_cvsignore = False + self.cross_project_commits = True + self.cross_branch_commits = True + self.retain_conflicting_attic_files = False + + self.initial_project_commit_message = ( + 'Standard project directories initialized by cvs2svn.' + ) + self.post_commit_message = ( + 'This commit was generated by cvs2svn to compensate for ' + 'changes in r%(revnum)d, which included commits to RCS files ' + 'with non-trunk default branches.' + ) + self.symbol_commit_message = ( + "This commit was manufactured by cvs2svn to create %(symbol_type)s " + "'%(symbol_name)s'." 
class CVSPath(object):
  """A CVS file or directory within a project.

  Members:

    id -- (int) unique ID for this CVSPath.  At any moment there is
        at most one CVSPath instance with a particular ID, so object
        identity coincides with object equality and instances can be
        used as map keys even without a __hash__() method.

    project -- (Project) the project containing this CVSPath.

    parent_directory -- (CVSDirectory or None) the CVSDirectory that
        contains this CVSPath.

    basename -- (string) base name of this CVSPath, without any ',v'
        suffix.  The project root directory has basename ''.

    ordinal -- (int) sort position of this instance relative to other
        CVSPath instances.  It is assigned by CollectData (using the
        ordering of slow_compare()) after all CVSFiles have been
        processed; __cmp__() simply compares ordinals.

  """

  __slots__ = [
      'id',
      'project',
      'parent_directory',
      'basename',
      'ordinal',
      ]

  def __init__(self, id, project, parent_directory, basename):
    self.id = id
    self.project = project
    self.parent_directory = parent_directory
    self.basename = basename

  def __getstate__(self):
    """This method must only be called after ordinal has been set."""

    return (
        self.id, self.project.id,
        self.parent_directory, self.basename,
        self.ordinal,
        )

  def __setstate__(self, state):
    (
        self.id, project_id,
        self.parent_directory, self.basename,
        self.ordinal,
        ) = state
    # Re-resolve the project from its ID via the Borg context.
    self.project = Ctx()._projects[project_id]

  def get_ancestry(self):
    """Return a list of the CVSPaths leading from the root path to SELF.

    The list starts with self.project.get_root_cvs_directory() and
    ends with self."""

    chain = []
    node = self
    while node is not None:
      # Prepend so the root ends up first without a final reverse().
      chain.insert(0, node)
      node = node.parent_directory
    return chain

  def get_cvs_path(self):
    """Return the canonical path of SELF within the Project.

    The canonical path:

    - Uses forward slashes

    - Doesn't include ',v' for files

    - Omits the 'Attic' path segment unless the file is to be left in
      an Attic directory in the SVN repository; i.e., unless the
      filename exists both in and out of Attic and the
      --retain-conflicting-attic-files option was specified.

    """

    return path_join(*self._get_dir_components())

  cvs_path = property(get_cvs_path)

  def _get_dir_components(self):
    """Return the path components leading to SELF.

    The list holds the basenames of every ancestor below the root
    directory, ending with SELF's own basename."""

    return [node.basename for node in self.get_ancestry()[1:]]

  def __eq__(a, b):
    """Equality is object identity (see the class docstring).

    Supplied so that equality tests do not fall back on __cmp__()."""

    return a is b

  def slow_compare(a, b):
    """Full comparison by project, then by path components."""

    by_project = cmp(a.project, b.project)
    if by_project:
      return by_project
    return cmp(a._get_dir_components(), b._get_dir_components())

  def __cmp__(a, b):
    """This method must only be called after ordinal has been set."""

    return cmp(a.ordinal, b.ordinal)
class CVSDirectory(CVSPath):
  """A CVS directory.

  Members:

    id -- (int or None) unique id for this directory; if None, a new
        id is generated.

    project -- (Project) the project containing this directory.

    parent_directory -- (CVSDirectory or None) the CVSDirectory that
        contains this one.

    basename -- (string) base name of this CVSDirectory (no ',v').

  """

  __slots__ = []

  def __init__(self, id, project, parent_directory, basename):
    """Initialize a new CVSDirectory object."""

    CVSPath.__init__(self, id, project, parent_directory, basename)

  def get_filename(self):
    """Return the filesystem path to this CVSPath in the CVS repository."""

    parent = self.parent_directory
    if parent is None:
      # The project root maps directly to the repository path.
      return self.project.project_cvs_repos_path
    return os.path.join(parent.get_filename(), self.basename)

  filename = property(get_filename)

  def __getstate__(self):
    return CVSPath.__getstate__(self)

  def __setstate__(self, state):
    CVSPath.__setstate__(self, state)

  def __str__(self):
    """For convenience only.  The format is subject to change at any time."""

    return self.cvs_path + '/'

  def __repr__(self):
    return 'CVSDirectory<%x>(%r)' % (self.id, str(self),)
class CVSFile(CVSPath):
  """A CVS file.

  Members:

    id -- (int) unique id for this file.

    project -- (Project) the project containing this file.

    parent_directory -- (CVSDirectory) the CVSDirectory containing
        this CVSFile.

    basename -- (string) base name of this CVSFile (no ',v').

    _in_attic -- (bool) True if the RCS file is in an Attic
        subdirectory that is not considered the parent directory.
        (If a file is in-and-out-of-attic and one copy is to be left
        in Attic after the conversion, then the Attic directory is
        that file's PARENT_DIRECTORY and _IN_ATTIC is False.)

    executable -- (bool) True iff the RCS file has its executable bit
        set.

    file_size -- (long) size of the RCS file in bytes.

    mode -- (string or None) RCS keyword-expansion mode, e.g. 'kkv'
        or 'kb'.

  PARENT_DIRECTORY may contain an 'Attic' component if it should be
  retained in the SVN repository; i.e., if the same filename exists
  out of Attic and the --retain-conflicting-attic-files option was
  specified.

  """

  __slots__ = [
      '_in_attic',
      'executable',
      'file_size',
      'mode',
      ]

  def __init__(
        self, id, project, parent_directory, basename, in_attic,
        executable, file_size, mode
        ):
    """Initialize a new CVSFile object."""

    CVSPath.__init__(self, id, project, parent_directory, basename)
    self._in_attic = in_attic
    self.executable = executable
    self.file_size = file_size
    self.mode = mode

    # Unlike a directory, a file always has a parent.
    assert self.parent_directory is not None

  def get_filename(self):
    """Return the filesystem path to this CVSPath in the CVS repository."""

    rcs_basename = self.basename + ',v'
    if self._in_attic:
      return os.path.join(
          self.parent_directory.filename, 'Attic', rcs_basename
          )
    return os.path.join(self.parent_directory.filename, rcs_basename)

  filename = property(get_filename)

  def __getstate__(self):
    return (
        CVSPath.__getstate__(self),
        self._in_attic, self.executable, self.file_size, self.mode,
        )

  def __setstate__(self, state):
    (
        cvs_path_state,
        self._in_attic, self.executable, self.file_size, self.mode,
        ) = state
    CVSPath.__setstate__(self, cvs_path_state)

  def __str__(self):
    """For convenience only.  The format is subject to change at any time."""

    return self.cvs_path

  def __repr__(self):
    return 'CVSFile<%x>(%r)' % (self.id, str(self),)
class CVSFileDatabase:
  """A database to store CVSFile objects and retrieve them by their id."""

  def __init__(self, mode):
    """Initialize an instance, opening database in MODE (where MODE is
    either DB_OPEN_NEW or DB_OPEN_READ).

    Raise RuntimeError for any other MODE."""

    self.mode = mode

    # A map { id : CVSFile }
    self._cvs_files = {}

    if self.mode == DB_OPEN_NEW:
      pass
    elif self.mode == DB_OPEN_READ:
      f = open(artifact_manager.get_temp_file(config.CVS_FILES_DB), 'rb')
      try:
        cvs_files = cPickle.load(f)
      finally:
        # Close the handle even if unpickling fails.  (The original
        # code never closed the read handle at all, leaking it.)
        f.close()
      for cvs_file in cvs_files:
        self._cvs_files[cvs_file.id] = cvs_file
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)

  def log_file(self, cvs_file):
    """Add CVS_FILE, a CVSFile instance, to the database.

    Raise RuntimeError if the database was opened read-only."""

    if self.mode == DB_OPEN_READ:
      raise RuntimeError('Cannot write items in mode %r' % self.mode)

    self._cvs_files[cvs_file.id] = cvs_file

  def itervalues(self):
    """Iterate over the stored CVSFile instances (arbitrary order)."""

    for value in self._cvs_files.itervalues():
      yield value

  def get_file(self, id):
    """Return the CVSFile with the specified ID."""

    return self._cvs_files[id]

  def close(self):
    """Write the database to disk (DB_OPEN_NEW only) and drop the map."""

    if self.mode == DB_OPEN_NEW:
      f = open(artifact_manager.get_temp_file(config.CVS_FILES_DB), 'wb')
      try:
        cPickle.dump(self._cvs_files.values(), f, -1)
      finally:
        f.close()

    # Release the (possibly large) map; further use is an error.
    self._cvs_files = None
class LODItems(object):
  """The CVSItems that belong to one line of development in one file."""

  def __init__(self, lod, cvs_branch, cvs_revisions, cvs_branches, cvs_tags):
    # The LineOfDevelopment described by this instance.
    self.lod = lod

    # The CVSBranch starting this LOD, if any; otherwise, None.
    self.cvs_branch = cvs_branch

    # The CVSRevisions on this LOD (possibly none), in dependency
    # order.
    self.cvs_revisions = cvs_revisions

    # The CVSBranches sprouting from this LOD (either from cvs_branch
    # or from one of the CVSRevisions).
    self.cvs_branches = cvs_branches

    # The CVSTags sprouting from this LOD (either from cvs_branch or
    # from one of the CVSRevisions).
    self.cvs_tags = cvs_tags

  def is_trivial_import(self):
    """Return True iff this LOD is a trivial import branch in this file.

    A trivial import branch is a branch that was used for a single
    import and nothing else.  Such a branch is eligible for being
    grafted onto trunk, even if it has branch blockers."""

    revisions = self.cvs_revisions
    return len(revisions) == 1 and revisions[0].ntdbr

  def is_pure_ntdb(self):
    """Return True iff this LOD is a pure NTDB in this file.

    A pure non-trunk default branch is defined to be a branch that
    contains only NTDB revisions (and at least one of them).  Such a
    branch is eligible for being grafted onto trunk, even if it has
    branch blockers."""

    revisions = self.cvs_revisions
    return revisions and revisions[-1].ntdbr

  def iter_blockers(self):
    """Yield the CVSTags/CVSBranches that block excluding this LOD.

    A pure NTDB has no blockers, because its blockers can be grafted
    to trunk.  Any other branch is blocked exactly by the symbols
    that sprout from its non-NTDB revisions."""

    if self.is_pure_ntdb():
      return

    blocking_sources = set(
        cvs_revision.id
        for cvs_revision in self.cvs_revisions
        if not cvs_revision.ntdbr
        )

    for cvs_tag in self.cvs_tags:
      if cvs_tag.source_id in blocking_sources:
        yield cvs_tag

    for cvs_branch in self.cvs_branches:
      if cvs_branch.source_id in blocking_sources:
        yield cvs_branch
+ pass + else: + # Other branches are only blocked by symbols that sprout from + # non-NTDB revisions: + non_ntdbr_revision_ids = set() + for cvs_revision in self.cvs_revisions: + if not cvs_revision.ntdbr: + non_ntdbr_revision_ids.add(cvs_revision.id) + + for cvs_tag in self.cvs_tags: + if cvs_tag.source_id in non_ntdbr_revision_ids: + yield cvs_tag + + for cvs_branch in self.cvs_branches: + if cvs_branch.source_id in non_ntdbr_revision_ids: + yield cvs_branch + + +class CVSFileItems(object): + def __init__(self, cvs_file, trunk, cvs_items): + # The file whose data this instance holds. + self.cvs_file = cvs_file + + # The symbol that represents "Trunk" in this file. + self.trunk = trunk + + # A map from CVSItem.id to CVSItem: + self._cvs_items = {} + + # The cvs_item_id of each root in the CVSItem forest. (A root is + # defined to be any CVSRevision with no prev_id.) + self.root_ids = set() + + for cvs_item in cvs_items: + self.add(cvs_item) + if isinstance(cvs_item, CVSRevision) and cvs_item.prev_id is None: + self.root_ids.add(cvs_item.id) + + def __getstate__(self): + return (self.cvs_file.id, self.values(),) + + def __setstate__(self, state): + (cvs_file_id, cvs_items,) = state + cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id) + CVSFileItems.__init__( + self, cvs_file, cvs_file.project.get_trunk(), cvs_items, + ) + + def add(self, cvs_item): + self._cvs_items[cvs_item.id] = cvs_item + + def __getitem__(self, id): + """Return the CVSItem with the specified ID.""" + + return self._cvs_items[id] + + def get(self, id, default=None): + return self._cvs_items.get(id, default) + + def __delitem__(self, id): + assert id not in self.root_ids + del self._cvs_items[id] + + def values(self): + return self._cvs_items.values() + + def check_link_consistency(self): + """Check that the CVSItems are linked correctly with each other.""" + + for cvs_item in self.values(): + try: + cvs_item.check_links(self) + except AssertionError: + Log().error( + 'Link consistency error in 
%s\n' + 'This is probably a bug internal to cvs2svn. Please file a bug\n' + 'report including the following stack trace (see FAQ for more ' + 'info).' + % (cvs_item,)) + raise + + def _get_lod(self, lod, cvs_branch, start_id): + """Return the indicated LODItems. + + LOD is the corresponding LineOfDevelopment. CVS_BRANCH is the + CVSBranch instance that starts the LOD if any; otherwise it is + None. START_ID is the id of the first CVSRevision on this LOD, or + None if there are none.""" + + cvs_revisions = [] + cvs_branches = [] + cvs_tags = [] + + def process_subitems(cvs_item): + """Process the branches and tags that are rooted in CVS_ITEM. + + CVS_ITEM can be a CVSRevision or a CVSBranch.""" + + for branch_id in cvs_item.branch_ids[:]: + cvs_branches.append(self[branch_id]) + + for tag_id in cvs_item.tag_ids: + cvs_tags.append(self[tag_id]) + + if cvs_branch is not None: + # Include the symbols sprouting directly from the CVSBranch: + process_subitems(cvs_branch) + + id = start_id + while id is not None: + cvs_rev = self[id] + cvs_revisions.append(cvs_rev) + process_subitems(cvs_rev) + id = cvs_rev.next_id + + return LODItems(lod, cvs_branch, cvs_revisions, cvs_branches, cvs_tags) + + def get_lod_items(self, cvs_branch): + """Return an LODItems describing the branch that starts at CVS_BRANCH. + + CVS_BRANCH must be an instance of CVSBranch contained in this + CVSFileItems.""" + + return self._get_lod(cvs_branch.symbol, cvs_branch, cvs_branch.next_id) + + def iter_root_lods(self): + """Iterate over the LODItems for all root LODs (non-recursively).""" + + for id in list(self.root_ids): + cvs_item = self[id] + if isinstance(cvs_item, CVSRevision): + # This LOD doesn't have a CVSBranch associated with it. + # Either it is Trunk, or it is a branch whose CVSBranch has + # been deleted. + yield self._get_lod(cvs_item.lod, None, id) + elif isinstance(cvs_item, CVSBranch): + # This is a Branch that has been severed from the rest of the + # tree. 
+ yield self._get_lod(cvs_item.symbol, cvs_item, cvs_item.next_id) + else: + raise InternalError('Unexpected root item: %s' % (cvs_item,)) + + def _iter_tree(self, lod, cvs_branch, start_id): + """Iterate over the tree that starts at the specified line of development. + + LOD is the LineOfDevelopment where the iteration should start. + CVS_BRANCH is the CVSBranch instance that starts the LOD if any; + otherwise it is None. ID is the id of the first CVSRevision on + this LOD, or None if there are none. + + There are two cases handled by this routine: trunk (where LOD is a + Trunk instance, CVS_BRANCH is None, and ID is the id of the 1.1 + revision) and a branch (where LOD is a Branch instance, CVS_BRANCH + is a CVSBranch instance, and ID is either the id of the first + CVSRevision on the branch or None if there are no CVSRevisions on + the branch). Note that CVS_BRANCH and ID cannot simultaneously be + None. + + Yield an LODItems instance for each line of development.""" + + cvs_revisions = [] + cvs_branches = [] + cvs_tags = [] + + def process_subitems(cvs_item): + """Process the branches and tags that are rooted in CVS_ITEM. + + CVS_ITEM can be a CVSRevision or a CVSBranch.""" + + for branch_id in cvs_item.branch_ids[:]: + # Recurse into the branch: + branch = self[branch_id] + for lod_items in self._iter_tree( + branch.symbol, branch, branch.next_id + ): + yield lod_items + # The caller might have deleted the branch that we just + # yielded. If it is no longer present, then do not add it to + # the list of cvs_branches. 
+ try: + cvs_branches.append(self[branch_id]) + except KeyError: + pass + + for tag_id in cvs_item.tag_ids: + cvs_tags.append(self[tag_id]) + + if cvs_branch is not None: + # Include the symbols sprouting directly from the CVSBranch: + for lod_items in process_subitems(cvs_branch): + yield lod_items + + id = start_id + while id is not None: + cvs_rev = self[id] + cvs_revisions.append(cvs_rev) + + for lod_items in process_subitems(cvs_rev): + yield lod_items + + id = cvs_rev.next_id + + yield LODItems(lod, cvs_branch, cvs_revisions, cvs_branches, cvs_tags) + + def iter_lods(self): + """Iterate over LinesOfDevelopment in this file, in depth-first order. + + For each LOD, yield an LODItems instance. The traversal starts at + each root node but returns the LODs in depth-first order. + + It is allowed to modify the CVSFileItems instance while the + traversal is occurring, but only in ways that don't affect the + tree structure above (i.e., towards the trunk from) the current + LOD.""" + + # Make a list out of root_ids so that callers can change it: + for id in list(self.root_ids): + cvs_item = self[id] + if isinstance(cvs_item, CVSRevision): + # This LOD doesn't have a CVSBranch associated with it. + # Either it is Trunk, or it is a branch whose CVSBranch has + # been deleted. + lod = cvs_item.lod + cvs_branch = None + elif isinstance(cvs_item, CVSBranch): + # This is a Branch that has been severed from the rest of the + # tree. + lod = cvs_item.symbol + id = cvs_item.next_id + cvs_branch = cvs_item + else: + raise InternalError('Unexpected root item: %s' % (cvs_item,)) + + for lod_items in self._iter_tree(lod, cvs_branch, id): + yield lod_items + + def iter_deltatext_ancestors(self, cvs_rev): + """Generate the delta-dependency ancestors of CVS_REV. 
+ + Generate then ancestors of CVS_REV in deltatext order; i.e., back + along branches towards trunk, then outwards along trunk towards + HEAD.""" + + while True: + # Determine the next candidate source revision: + if isinstance(cvs_rev.lod, Trunk): + if cvs_rev.next_id is None: + # HEAD has no ancestors, so we are done: + return + else: + cvs_rev = self[cvs_rev.next_id] + else: + cvs_rev = self[cvs_rev.prev_id] + + yield cvs_rev + + def _sever_branch(self, lod_items): + """Sever the branch from its source and discard the CVSBranch. + + LOD_ITEMS describes a branch that should be severed from its + source, deleting the CVSBranch and creating a new root. Also set + LOD_ITEMS.cvs_branch to none. + + This method can only be used before symbols have been grafted onto + CVSBranches. It does not adjust NTDBR, NTDBR_PREV_ID or + NTDBR_NEXT_ID even if LOD_ITEMS describes a NTDB.""" + + cvs_branch = lod_items.cvs_branch + assert cvs_branch is not None + assert not cvs_branch.tag_ids + assert not cvs_branch.branch_ids + source_rev = self[cvs_branch.source_id] + + # We only cover the following case, even though after + # FilterSymbolsPass cvs_branch.source_id might refer to another + # CVSBranch. 
+ assert isinstance(source_rev, CVSRevision) + + # Delete the CVSBranch itself: + lod_items.cvs_branch = None + del self[cvs_branch.id] + + # Delete the reference from the source revision to the CVSBranch: + source_rev.branch_ids.remove(cvs_branch.id) + + # Delete the reference from the first revision on the branch to + # the CVSBranch: + if lod_items.cvs_revisions: + first_rev = lod_items.cvs_revisions[0] + + # Delete the reference from first_rev to the CVSBranch: + first_rev.first_on_branch_id = None + + # Delete the reference from the source revision to the first + # revision on the branch: + source_rev.branch_commit_ids.remove(first_rev.id) + + # ...and vice versa: + first_rev.prev_id = None + + # Change the type of first_rev (e.g., from Change to Add): + first_rev.__class__ = cvs_revision_type_map[ + (isinstance(first_rev, CVSRevisionModification), False,) + ] + + # Now first_rev is a new root: + self.root_ids.add(first_rev.id) + + def adjust_ntdbrs(self, ntdbr_cvs_revs): + """Adjust the specified non-trunk default branch revisions. + + NTDBR_CVS_REVS is a list of CVSRevision instances in this file + that have been determined to be non-trunk default branch + revisions. + + The first revision on the default branch is handled strangely by + CVS. If a file is imported (as opposed to being added), CVS + creates a 1.1 revision, then creates a vendor branch 1.1.1 based + on 1.1, then creates a 1.1.1.1 revision that is identical to the + 1.1 revision (i.e., its deltatext is empty). The log message that + the user typed when importing is stored with the 1.1.1.1 revision. + The 1.1 revision always contains a standard, generated log + message, 'Initial revision\n'. + + When we detect a straightforward import like this, we want to + handle it by deleting the 1.1 revision (which doesn't contain any + useful information) and making 1.1.1.1 into an independent root in + the file's dependency tree. 
In SVN, 1.1.1.1 will be added + directly to the vendor branch with its initial content. Then in a + special 'post-commit', the 1.1.1.1 revision is copied back to + trunk. + + If the user imports again to the same vendor branch, then CVS + creates revisions 1.1.1.2, 1.1.1.3, etc. on the vendor branch, + *without* counterparts in trunk (even though these revisions + effectively play the role of trunk revisions). So after we add + such revisions to the vendor branch, we also copy them back to + trunk in post-commits. + + Set the ntdbr members of the revisions listed in NTDBR_CVS_REVS to + True. Also, if there is a 1.2 revision, then set that revision to + depend on the last non-trunk default branch revision and possibly + adjust its type accordingly.""" + + for cvs_rev in ntdbr_cvs_revs: + cvs_rev.ntdbr = True + + # Look for a 1.2 revision: + rev_1_1 = self[ntdbr_cvs_revs[0].prev_id] + + rev_1_2 = self.get(rev_1_1.next_id) + if rev_1_2 is not None: + # Revision 1.2 logically follows the imported revisions, not + # 1.1. Accordingly, connect it to the last NTDBR and possibly + # change its type. + last_ntdbr = ntdbr_cvs_revs[-1] + rev_1_2.ntdbr_prev_id = last_ntdbr.id + last_ntdbr.ntdbr_next_id = rev_1_2.id + rev_1_2.__class__ = cvs_revision_type_map[( + isinstance(rev_1_2, CVSRevisionModification), + isinstance(last_ntdbr, CVSRevisionModification), + )] + + def process_live_ntdb(self, vendor_lod_items): + """VENDOR_LOD_ITEMS is a live default branch; process it. + + In this case, all revisions on the default branch are NTDBRs and + it is an error if there is also a '1.2' revision. + + Return True iff this transformation really does something. 
Raise + a VendorBranchError if there is a '1.2' revision.""" + + rev_1_1 = self[vendor_lod_items.cvs_branch.source_id] + rev_1_2_id = rev_1_1.next_id + if rev_1_2_id is not None: + raise VendorBranchError( + 'File \'%s\' has default branch=%s but also a revision %s' + % (self.cvs_file.filename, + vendor_lod_items.cvs_branch.branch_number, self[rev_1_2_id].rev,) + ) + + ntdbr_cvs_revs = list(vendor_lod_items.cvs_revisions) + + if ntdbr_cvs_revs: + self.adjust_ntdbrs(ntdbr_cvs_revs) + return True + else: + return False + + def process_historical_ntdb(self, vendor_lod_items): + """There appears to have been a non-trunk default branch in the past. + + There is currently no default branch, but the branch described by + file appears to have been imported. So our educated guess is that + all revisions on the '1.1.1' branch (described by + VENDOR_LOD_ITEMS) with timestamps prior to the timestamp of '1.2' + were non-trunk default branch revisions. + + Return True iff this transformation really does something. + + This really only handles standard '1.1.1.*'-style vendor + revisions. One could conceivably have a file whose default branch + is 1.1.3 or whatever, or was that at some point in time, with + vendor revisions 1.1.3.1, 1.1.3.2, etc. But with the default + branch gone now, we'd have no basis for assuming that the + non-standard vendor branch had ever been the default branch + anyway. + + Note that we rely on comparisons between the timestamps of the + revisions on the vendor branch and that of revision 1.2, even + though the timestamps might be incorrect due to clock skew. We + could do a slightly better job if we used the changeset + timestamps, as it is possible that the dependencies that went into + determining those timestamps are more accurate. 
But that would + require an extra pass or two.""" + + rev_1_1 = self[vendor_lod_items.cvs_branch.source_id] + rev_1_2_id = rev_1_1.next_id + + if rev_1_2_id is None: + rev_1_2_timestamp = None + else: + rev_1_2_timestamp = self[rev_1_2_id].timestamp + + ntdbr_cvs_revs = [] + for cvs_rev in vendor_lod_items.cvs_revisions: + if rev_1_2_timestamp is not None \ + and cvs_rev.timestamp >= rev_1_2_timestamp: + # That's the end of the once-default branch. + break + ntdbr_cvs_revs.append(cvs_rev) + + if ntdbr_cvs_revs: + self.adjust_ntdbrs(ntdbr_cvs_revs) + return True + else: + return False + + def imported_remove_1_1(self, vendor_lod_items): + """This file was imported. Remove the 1.1 revision if possible. + + VENDOR_LOD_ITEMS is the LODItems instance for the vendor branch. + See adjust_ntdbrs() for more information.""" + + assert vendor_lod_items.cvs_revisions + cvs_rev = vendor_lod_items.cvs_revisions[0] + + if isinstance(cvs_rev, CVSRevisionModification) \ + and not cvs_rev.deltatext_exists: + cvs_branch = vendor_lod_items.cvs_branch + rev_1_1 = self[cvs_branch.source_id] + assert isinstance(rev_1_1, CVSRevision) + Log().debug('Removing unnecessary revision %s' % (rev_1_1,)) + + # Delete the 1.1.1 CVSBranch and sever the vendor branch from trunk: + self._sever_branch(vendor_lod_items) + + # Delete rev_1_1: + self.root_ids.remove(rev_1_1.id) + del self[rev_1_1.id] + rev_1_2_id = rev_1_1.next_id + if rev_1_2_id is not None: + rev_1_2 = self[rev_1_2_id] + rev_1_2.prev_id = None + self.root_ids.add(rev_1_2.id) + + # Move any tags and branches from rev_1_1 to cvs_rev: + cvs_rev.tag_ids.extend(rev_1_1.tag_ids) + for id in rev_1_1.tag_ids: + cvs_tag = self[id] + cvs_tag.source_lod = cvs_rev.lod + cvs_tag.source_id = cvs_rev.id + cvs_rev.branch_ids[0:0] = rev_1_1.branch_ids + for id in rev_1_1.branch_ids: + cvs_branch = self[id] + cvs_branch.source_lod = cvs_rev.lod + cvs_branch.source_id = cvs_rev.id + cvs_rev.branch_commit_ids[0:0] = rev_1_1.branch_commit_ids + for id in 
rev_1_1.branch_commit_ids: + cvs_rev2 = self[id] + cvs_rev2.prev_id = cvs_rev.id + + def _delete_unneeded(self, cvs_item, metadata_db): + if isinstance(cvs_item, CVSRevisionNoop) \ + and cvs_item.rev == '1.1' \ + and isinstance(cvs_item.lod, Trunk) \ + and len(cvs_item.branch_ids) >= 1 \ + and self[cvs_item.branch_ids[0]].next_id is not None \ + and not cvs_item.closed_symbols \ + and not cvs_item.ntdbr: + # FIXME: This message will not match if the RCS file was renamed + # manually after it was created. + log_msg = metadata_db[cvs_item.metadata_id].log_msg + cvs_generated_msg = 'file %s was initially added on branch %s.\n' % ( + self.cvs_file.basename, + self[cvs_item.branch_ids[0]].symbol.name,) + return log_msg == cvs_generated_msg + else: + return False + + def remove_unneeded_deletes(self, metadata_db): + """Remove unneeded deletes for this file. + + If a file is added on a branch, then a trunk revision is added at + the same time in the 'Dead' state. This revision doesn't do + anything useful, so delete it.""" + + for id in self.root_ids: + cvs_item = self[id] + if self._delete_unneeded(cvs_item, metadata_db): + Log().debug('Removing unnecessary delete %s' % (cvs_item,)) + + # Delete cvs_item: + self.root_ids.remove(cvs_item.id) + del self[id] + if cvs_item.next_id is not None: + cvs_rev_next = self[cvs_item.next_id] + cvs_rev_next.prev_id = None + self.root_ids.add(cvs_rev_next.id) + + # Delete all CVSBranches rooted at this revision. If there is + # a CVSRevision on the branch, it should already be an add so + # it doesn't have to be changed. 
+ for cvs_branch_id in cvs_item.branch_ids: + cvs_branch = self[cvs_branch_id] + del self[cvs_branch.id] + + if cvs_branch.next_id is not None: + cvs_branch_next = self[cvs_branch.next_id] + cvs_branch_next.first_on_branch_id = None + cvs_branch_next.prev_id = None + self.root_ids.add(cvs_branch_next.id) + + # Tagging a dead revision doesn't do anything, so remove any + # tags that were set on 1.1: + for cvs_tag_id in cvs_item.tag_ids: + del self[cvs_tag_id] + + # This can only happen once per file, and we might have just + # changed self.root_ids, so break out of the loop: + break + + def _initial_branch_delete_unneeded(self, lod_items, metadata_db): + """Return True iff the initial revision in LOD_ITEMS can be deleted.""" + + if lod_items.cvs_branch is not None \ + and lod_items.cvs_branch.source_id is not None \ + and len(lod_items.cvs_revisions) >= 2: + cvs_revision = lod_items.cvs_revisions[0] + cvs_rev_source = self[lod_items.cvs_branch.source_id] + if isinstance(cvs_revision, CVSRevisionAbsent) \ + and not cvs_revision.tag_ids \ + and not cvs_revision.branch_ids \ + and abs(cvs_revision.timestamp - cvs_rev_source.timestamp) <= 2: + # FIXME: This message will not match if the RCS file was renamed + # manually after it was created. + log_msg = metadata_db[cvs_revision.metadata_id].log_msg + return bool(re.match( + r'file %s was added on branch .* on ' + r'\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2}( [\+\-]\d{4})?' + '\n' % (re.escape(self.cvs_file.basename),), + log_msg, + )) + return False + + def remove_initial_branch_deletes(self, metadata_db): + """If the first revision on a branch is an unnecessary delete, remove it. + + If a file is added on a branch (whether or not it already existed + on trunk), then new versions of CVS add a first branch revision in + the 'dead' state (to indicate that the file did not exist on the + branch when the branch was created) followed by the second branch + revision, which is an add. 
When we encounter this situation, we + sever the branch from trunk and delete the first branch + revision.""" + + for lod_items in self.iter_lods(): + if self._initial_branch_delete_unneeded(lod_items, metadata_db): + cvs_revision = lod_items.cvs_revisions[0] + Log().debug( + 'Removing unnecessary initial branch delete %s' % (cvs_revision,) + ) + cvs_branch = lod_items.cvs_branch + cvs_rev_source = self[cvs_branch.source_id] + cvs_rev_next = lod_items.cvs_revisions[1] + + # Delete cvs_revision: + del self[cvs_revision.id] + cvs_rev_next.prev_id = None + self.root_ids.add(cvs_rev_next.id) + cvs_rev_source.branch_commit_ids.remove(cvs_revision.id) + + # Delete the CVSBranch on which it is located: + del self[cvs_branch.id] + cvs_rev_source.branch_ids.remove(cvs_branch.id) + + def _exclude_tag(self, cvs_tag): + """Exclude the specified CVS_TAG.""" + + del self[cvs_tag.id] + + # A CVSTag is the successor of the CVSRevision that it + # sprouts from. Delete this tag from that revision's + # tag_ids: + self[cvs_tag.source_id].tag_ids.remove(cvs_tag.id) + + def _exclude_branch(self, lod_items): + """Exclude the branch described by LOD_ITEMS, including its revisions. + + (Do not update the LOD_ITEMS instance itself.) + + If the LOD starts with non-trunk default branch revisions, leave + the branch and the NTDB revisions in place, but delete any + subsequent revisions that are not NTDB revisions. 
In this case, + return True; otherwise return False""" + + if lod_items.cvs_revisions and lod_items.cvs_revisions[0].ntdbr: + for cvs_rev in lod_items.cvs_revisions: + if not cvs_rev.ntdbr: + # We've found the first non-NTDBR, and it's stored in cvs_rev: + break + else: + # There was no revision following the NTDBRs: + cvs_rev = None + + if cvs_rev: + last_ntdbr = self[cvs_rev.prev_id] + last_ntdbr.next_id = None + while True: + del self[cvs_rev.id] + if cvs_rev.next_id is None: + break + cvs_rev = self[cvs_rev.next_id] + + return True + + else: + if lod_items.cvs_branch is not None: + # Delete the CVSBranch itself: + cvs_branch = lod_items.cvs_branch + + del self[cvs_branch.id] + + # A CVSBranch is the successor of the CVSRevision that it + # sprouts from. Delete this branch from that revision's + # branch_ids: + self[cvs_branch.source_id].branch_ids.remove(cvs_branch.id) + + if lod_items.cvs_revisions: + # The first CVSRevision on the branch has to be either detached + # from the revision from which the branch sprang, or removed + # from self.root_ids: + cvs_rev = lod_items.cvs_revisions[0] + if cvs_rev.prev_id is None: + self.root_ids.remove(cvs_rev.id) + else: + self[cvs_rev.prev_id].branch_commit_ids.remove(cvs_rev.id) + + for cvs_rev in lod_items.cvs_revisions: + del self[cvs_rev.id] + + return False + + def graft_ntdbr_to_trunk(self): + """Graft the non-trunk default branch revisions to trunk. 
+ + They should already be alone on a branch that may or may not have + a CVSBranch connecting it to trunk.""" + + for lod_items in self.iter_lods(): + if lod_items.cvs_revisions and lod_items.cvs_revisions[0].ntdbr: + assert lod_items.is_pure_ntdb() + + first_rev = lod_items.cvs_revisions[0] + last_rev = lod_items.cvs_revisions[-1] + rev_1_1 = self.get(first_rev.prev_id) + rev_1_2 = self.get(last_rev.ntdbr_next_id) + + if lod_items.cvs_branch is not None: + self._sever_branch(lod_items) + + if rev_1_1 is not None: + rev_1_1.next_id = first_rev.id + first_rev.prev_id = rev_1_1.id + + self.root_ids.remove(first_rev.id) + + first_rev.__class__ = cvs_revision_type_map[( + isinstance(first_rev, CVSRevisionModification), + isinstance(rev_1_1, CVSRevisionModification), + )] + + if rev_1_2 is not None: + rev_1_2.ntdbr_prev_id = None + last_rev.ntdbr_next_id = None + + if rev_1_2.prev_id is None: + self.root_ids.remove(rev_1_2.id) + + rev_1_2.prev_id = last_rev.id + last_rev.next_id = rev_1_2.id + + # The effective_pred_id of rev_1_2 was not changed, so we + # don't have to change rev_1_2's type. + + for cvs_rev in lod_items.cvs_revisions: + cvs_rev.ntdbr = False + cvs_rev.lod = self.trunk + + for cvs_branch in lod_items.cvs_branches: + cvs_branch.source_lod = self.trunk + + for cvs_tag in lod_items.cvs_tags: + cvs_tag.source_lod = self.trunk + + return + + def exclude_non_trunk(self): + """Delete all tags and branches.""" + + ntdbr_excluded = False + for lod_items in self.iter_lods(): + for cvs_tag in lod_items.cvs_tags[:]: + self._exclude_tag(cvs_tag) + lod_items.cvs_tags.remove(cvs_tag) + + if not isinstance(lod_items.lod, Trunk): + assert not lod_items.cvs_branches + + ntdbr_excluded |= self._exclude_branch(lod_items) + + if ntdbr_excluded: + self.graft_ntdbr_to_trunk() + + def filter_excluded_symbols(self, revision_excluder): + """Delete any excluded symbols and references to them. 
+ + Call the revision_excluder's callback methods to let it know what + is being excluded.""" + + ntdbr_excluded = False + for lod_items in self.iter_lods(): + # Delete any excluded tags: + for cvs_tag in lod_items.cvs_tags[:]: + if isinstance(cvs_tag.symbol, ExcludedSymbol): + self._exclude_tag(cvs_tag) + + lod_items.cvs_tags.remove(cvs_tag) + + # Delete the whole branch if it is to be excluded: + if isinstance(lod_items.lod, ExcludedSymbol): + # A symbol can only be excluded if no other symbols spring + # from it. This was already checked in CollateSymbolsPass, so + # these conditions should already be satisfied. + assert not list(lod_items.iter_blockers()) + + ntdbr_excluded |= self._exclude_branch(lod_items) + + if ntdbr_excluded: + self.graft_ntdbr_to_trunk() + + revision_excluder.process_file(self) + + def _mutate_branch_to_tag(self, cvs_branch): + """Mutate the branch CVS_BRANCH into a tag.""" + + if cvs_branch.next_id is not None: + # This shouldn't happen because it was checked in + # CollateSymbolsPass: + raise FatalError('Attempt to exclude a branch with commits.') + cvs_tag = CVSTag( + cvs_branch.id, cvs_branch.cvs_file, cvs_branch.symbol, + cvs_branch.source_lod, cvs_branch.source_id, + cvs_branch.revision_recorder_token, + ) + self.add(cvs_tag) + cvs_revision = self[cvs_tag.source_id] + cvs_revision.branch_ids.remove(cvs_tag.id) + cvs_revision.tag_ids.append(cvs_tag.id) + + def _mutate_tag_to_branch(self, cvs_tag): + """Mutate the tag into a branch.""" + + cvs_branch = CVSBranch( + cvs_tag.id, cvs_tag.cvs_file, cvs_tag.symbol, + None, cvs_tag.source_lod, cvs_tag.source_id, None, + cvs_tag.revision_recorder_token, + ) + self.add(cvs_branch) + cvs_revision = self[cvs_branch.source_id] + cvs_revision.tag_ids.remove(cvs_branch.id) + cvs_revision.branch_ids.append(cvs_branch.id) + + def _mutate_symbol(self, cvs_symbol): + """Mutate CVS_SYMBOL if necessary.""" + + symbol = cvs_symbol.symbol + if isinstance(cvs_symbol, CVSBranch) and isinstance(symbol, Tag): 
+ self._mutate_branch_to_tag(cvs_symbol) + elif isinstance(cvs_symbol, CVSTag) and isinstance(symbol, Branch): + self._mutate_tag_to_branch(cvs_symbol) + + def mutate_symbols(self): + """Force symbols to be tags/branches based on self.symbol_db.""" + + for cvs_item in self.values(): + if isinstance(cvs_item, CVSRevision): + # This CVSRevision may be affected by the mutation of any + # CVSSymbols that it references, but there is nothing to do + # here directly. + pass + elif isinstance(cvs_item, CVSSymbol): + self._mutate_symbol(cvs_item) + else: + raise RuntimeError('Unknown cvs item type') + + def _adjust_tag_parent(self, cvs_tag): + """Adjust the parent of CVS_TAG if possible and preferred. + + CVS_TAG is an instance of CVSTag. This method must be called in + leaf-to-trunk order.""" + + # The Symbol that cvs_tag would like to have as a parent: + preferred_parent = Ctx()._symbol_db.get_symbol( + cvs_tag.symbol.preferred_parent_id) + + if cvs_tag.source_lod == preferred_parent: + # The preferred parent is already the parent. + return + + # The CVSRevision that is its direct parent: + source = self[cvs_tag.source_id] + assert isinstance(source, CVSRevision) + + if isinstance(preferred_parent, Trunk): + # It is not possible to graft *onto* Trunk: + return + + # Try to find the preferred parent among the possible parents: + for branch_id in source.branch_ids: + if self[branch_id].symbol == preferred_parent: + # We found it! + break + else: + # The preferred parent is not a possible parent in this file. + return + + parent = self[branch_id] + assert isinstance(parent, CVSBranch) + + Log().debug('Grafting %s from %s (on %s) onto %s' % ( + cvs_tag, source, source.lod, parent,)) + # Switch parent: + source.tag_ids.remove(cvs_tag.id) + parent.tag_ids.append(cvs_tag.id) + cvs_tag.source_lod = parent.symbol + cvs_tag.source_id = parent.id + + def _adjust_branch_parents(self, cvs_branch): + """Adjust the parent of CVS_BRANCH if possible and preferred. 
+ + CVS_BRANCH is an instance of CVSBranch. This method must be + called in leaf-to-trunk order.""" + + # The Symbol that cvs_branch would like to have as a parent: + preferred_parent = Ctx()._symbol_db.get_symbol( + cvs_branch.symbol.preferred_parent_id) + + if cvs_branch.source_lod == preferred_parent: + # The preferred parent is already the parent. + return + + # The CVSRevision that is its direct parent: + source = self[cvs_branch.source_id] + # This is always a CVSRevision because we haven't adjusted it yet: + assert isinstance(source, CVSRevision) + + if isinstance(preferred_parent, Trunk): + # It is not possible to graft *onto* Trunk: + return + + # Try to find the preferred parent among the possible parents: + for branch_id in source.branch_ids: + possible_parent = self[branch_id] + if possible_parent.symbol == preferred_parent: + # We found it! + break + elif possible_parent.symbol == cvs_branch.symbol: + # Only branches that precede the branch to be adjusted are + # considered possible parents. Leave parentage unchanged: + return + else: + # This point should never be reached. + raise InternalError( + 'Possible parent search did not terminate as expected') + + parent = possible_parent + assert isinstance(parent, CVSBranch) + + Log().debug('Grafting %s from %s (on %s) onto %s' % ( + cvs_branch, source, source.lod, parent,)) + # Switch parent: + source.branch_ids.remove(cvs_branch.id) + parent.branch_ids.append(cvs_branch.id) + cvs_branch.source_lod = parent.symbol + cvs_branch.source_id = parent.id + + def adjust_parents(self): + """Adjust the parents of symbols to their preferred parents. 
+ + If a CVSSymbol has a preferred parent that is different than its + current parent, and if the preferred parent is an allowed parent + of the CVSSymbol in this file, then graft the CVSSymbol onto its + preferred parent.""" + + for lod_items in self.iter_lods(): + for cvs_tag in lod_items.cvs_tags: + self._adjust_tag_parent(cvs_tag) + + for cvs_branch in lod_items.cvs_branches: + self._adjust_branch_parents(cvs_branch) + + def _get_revision_source(self, cvs_symbol): + """Return the CVSRevision that is the ultimate source of CVS_SYMBOL.""" + + while True: + cvs_item = self[cvs_symbol.source_id] + if isinstance(cvs_item, CVSRevision): + return cvs_item + else: + cvs_symbol = cvs_item + + def refine_symbols(self): + """Refine the types of the CVSSymbols in this file. + + Adjust the symbol types based on whether the source exists: + CVSBranch vs. CVSBranchNoop and CVSTag vs. CVSTagNoop.""" + + for lod_items in self.iter_lods(): + for cvs_tag in lod_items.cvs_tags: + source = self._get_revision_source(cvs_tag) + cvs_tag.__class__ = cvs_tag_type_map[ + isinstance(source, CVSRevisionModification) + ] + + for cvs_branch in lod_items.cvs_branches: + source = self._get_revision_source(cvs_branch) + cvs_branch.__class__ = cvs_branch_type_map[ + isinstance(source, CVSRevisionModification) + ] + + def record_opened_symbols(self): + """Set CVSRevision.opened_symbols for the surviving revisions.""" + + for cvs_item in self.values(): + if isinstance(cvs_item, (CVSRevision, CVSBranch)): + cvs_item.opened_symbols = [] + for cvs_symbol_opened_id in cvs_item.get_cvs_symbol_ids_opened(): + cvs_symbol_opened = self[cvs_symbol_opened_id] + cvs_item.opened_symbols.append( + (cvs_symbol_opened.symbol.id, cvs_symbol_opened.id,) + ) + + def record_closed_symbols(self): + """Set CVSRevision.closed_symbols for the surviving revisions. + + A CVSRevision closes the symbols that were opened by the CVSItems + that the CVSRevision closes. Got it? 
+ + This method must be called after record_opened_symbols().""" + + for cvs_item in self.values(): + if isinstance(cvs_item, CVSRevision): + cvs_item.closed_symbols = [] + for cvs_item_closed_id in cvs_item.get_ids_closed(): + cvs_item_closed = self[cvs_item_closed_id] + cvs_item.closed_symbols.extend(cvs_item_closed.opened_symbols) + + diff --git a/cvs2svn_lib/cvs_item.py b/cvs2svn_lib/cvs_item.py new file mode 100644 index 0000000..5c01a24 --- /dev/null +++ b/cvs2svn_lib/cvs_item.py @@ -0,0 +1,901 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains classes to store atomic CVS events. + +A CVSItem is a single event, pertaining to a single file, that can be +determined to have occured based on the information in the CVS +repository. 
+ +The inheritance tree is as follows: + +CVSItem +| ++--CVSRevision +| | +| +--CVSRevisionModification (* -> 'Exp') +| | | +| | +--CVSRevisionAdd ('dead' -> 'Exp') +| | | +| | +--CVSRevisionChange ('Exp' -> 'Exp') +| | +| +--CVSRevisionAbsent (* -> 'dead') +| | +| +--CVSRevisionDelete ('Exp' -> 'dead') +| | +| +--CVSRevisionNoop ('dead' -> 'dead') +| ++--CVSSymbol + | + +--CVSBranch + | | + | +--CVSBranchNoop + | + +--CVSTag + | + +--CVSTagNoop + +""" + + +from cvs2svn_lib.context import Ctx + + +class CVSItem(object): + __slots__ = [ + 'id', + 'cvs_file', + 'revision_recorder_token', + ] + + def __init__(self, id, cvs_file, revision_recorder_token): + self.id = id + self.cvs_file = cvs_file + self.revision_recorder_token = revision_recorder_token + + def __eq__(self, other): + return self.id == other.id + + def __cmp__(self, other): + return cmp(self.id, other.id) + + def __hash__(self): + return self.id + + def __getstate__(self): + raise NotImplementedError() + + def __setstate__(self, data): + raise NotImplementedError() + + def get_svn_path(self): + """Return the SVN path associated with this CVSItem.""" + + raise NotImplementedError() + + def get_pred_ids(self): + """Return the CVSItem.ids of direct predecessors of SELF. + + A predecessor is defined to be a CVSItem that has to have been + committed before this one.""" + + raise NotImplementedError() + + def get_succ_ids(self): + """Return the CVSItem.ids of direct successors of SELF. + + A direct successor is defined to be a CVSItem that has this one as + a direct predecessor.""" + + raise NotImplementedError() + + def get_cvs_symbol_ids_opened(self): + """Return an iterable over the ids of CVSSymbols that this item opens. + + The definition of 'open' is that the path corresponding to this + CVSItem will have to be copied when filling the corresponding + symbol.""" + + raise NotImplementedError() + + def get_ids_closed(self): + """Return an iterable over the CVSItem.ids of CVSItems closed by this one. 
+ + A CVSItem A is said to close a CVSItem B if committing A causes B + to be overwritten or deleted (no longer available) in the SVN + repository. This is interesting because it sets the last SVN + revision number from which the contents of B can be copied (for + example, to fill a symbol). See the concrete implementations of + this method for the exact rules about what closes what.""" + + raise NotImplementedError() + + def check_links(self, cvs_file_items): + """Check for consistency of links to other CVSItems. + + Other items can be looked up in CVS_FILE_ITEMS, which is an + instance of CVSFileItems. Raise an AssertionError if there is a + problem.""" + + raise NotImplementedError() + + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, self,) + + +class CVSRevision(CVSItem): + """Information about a single CVS revision. + + A CVSRevision holds the information known about a single version of + a single file. + + Members: + + id -- (int) unique ID for this revision. + + cvs_file -- (CVSFile) CVSFile affected by this revision. + + timestamp -- (int) date stamp for this revision. + + metadata_id -- (int) id of metadata instance record in + metadata_db. + + prev_id -- (int) id of the logically previous CVSRevision, either + on the same or the source branch (or None). + + next_id -- (int) id of the logically next CVSRevision (or None). + + rev -- (string) the CVS revision number, e.g., '1.3'. + + deltatext_exists -- (bool) true iff this revision's deltatext is + not empty. + + lod -- (LineOfDevelopment) LOD on which this revision occurred. + + first_on_branch_id -- (int or None) if this revision is the first + on its branch, the cvs_branch_id of that branch; else, None. + + ntdbr -- (bool) true iff this is a non-trunk default branch + revision. + + ntdbr_prev_id -- (int or None) Iff this is the 1.2 revision after + the end of a default branch, the id of the last rev on the + default branch; else, None. 
+ + ntdbr_next_id -- (int or None) Iff this is the last revision on a + default branch preceding a 1.2 rev, the id of the 1.2 + revision; else, None. + + tag_ids -- (list of int) ids of all CVSTags rooted at this + CVSRevision. + + branch_ids -- (list of int) ids of all CVSBranches rooted at this + CVSRevision. + + branch_commit_ids -- (list of int) ids of first CVSRevision + committed on each branch rooted in this revision (for branches + with commits). + + opened_symbols -- (None or list of (symbol_id, cvs_symbol_id) + tuples) information about all CVSSymbols opened by this + revision. This member is set in FilterSymbolsPass; before + then, it is None. + + closed_symbols -- (None or list of (symbol_id, cvs_symbol_id) + tuples) information about all CVSSymbols closed by this + revision. This member is set in FilterSymbolsPass; before + then, it is None. + + revision_recorder_token -- (arbitrary) a token that can be set by + RevisionRecorder for the later use of RevisionReader. + + """ + + __slots__ = [ + 'timestamp', + 'metadata_id', + 'prev_id', + 'next_id', + 'rev', + 'deltatext_exists', + 'lod', + 'first_on_branch_id', + 'ntdbr', + 'ntdbr_prev_id', + 'ntdbr_next_id', + 'tag_ids', + 'branch_ids', + 'branch_commit_ids', + 'opened_symbols', + 'closed_symbols', + ] + + def __init__(self, + id, cvs_file, + timestamp, metadata_id, + prev_id, next_id, + rev, deltatext_exists, + lod, first_on_branch_id, ntdbr, + ntdbr_prev_id, ntdbr_next_id, + tag_ids, branch_ids, branch_commit_ids, + revision_recorder_token): + """Initialize a new CVSRevision object.""" + + CVSItem.__init__(self, id, cvs_file, revision_recorder_token) + + self.timestamp = timestamp + self.metadata_id = metadata_id + self.prev_id = prev_id + self.next_id = next_id + self.rev = rev + self.deltatext_exists = deltatext_exists + self.lod = lod + self.first_on_branch_id = first_on_branch_id + self.ntdbr = ntdbr + self.ntdbr_prev_id = ntdbr_prev_id + self.ntdbr_next_id = ntdbr_next_id + self.tag_ids = 
tag_ids + self.branch_ids = branch_ids + self.branch_commit_ids = branch_commit_ids + self.opened_symbols = None + self.closed_symbols = None + + def _get_cvs_path(self): + return self.cvs_file.cvs_path + + cvs_path = property(_get_cvs_path) + + def get_svn_path(self): + return self.lod.get_path(self.cvs_file.cvs_path) + + def __getstate__(self): + """Return the contents of this instance, for pickling. + + The presence of this method improves the space efficiency of + pickling CVSRevision instances.""" + + return ( + self.id, self.cvs_file.id, + self.timestamp, self.metadata_id, + self.prev_id, self.next_id, + self.rev, + self.deltatext_exists, + self.lod.id, + self.first_on_branch_id, + self.ntdbr, + self.ntdbr_prev_id, self.ntdbr_next_id, + self.tag_ids, self.branch_ids, self.branch_commit_ids, + self.opened_symbols, self.closed_symbols, + self.revision_recorder_token, + ) + + def __setstate__(self, data): + (self.id, cvs_file_id, + self.timestamp, self.metadata_id, + self.prev_id, self.next_id, + self.rev, + self.deltatext_exists, + lod_id, + self.first_on_branch_id, + self.ntdbr, + self.ntdbr_prev_id, self.ntdbr_next_id, + self.tag_ids, self.branch_ids, self.branch_commit_ids, + self.opened_symbols, self.closed_symbols, + self.revision_recorder_token) = data + self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id) + self.lod = Ctx()._symbol_db.get_symbol(lod_id) + + def get_effective_prev_id(self): + """Return the ID of the effective predecessor of this item. 
+ + This is the ID of the item that determines whether the object + existed before this CVSRevision.""" + + if self.ntdbr_prev_id is not None: + return self.ntdbr_prev_id + else: + return self.prev_id + + def get_symbol_pred_ids(self): + """Return the pred_ids for symbol predecessors.""" + + retval = set() + if self.first_on_branch_id is not None: + retval.add(self.first_on_branch_id) + return retval + + def get_pred_ids(self): + retval = self.get_symbol_pred_ids() + if self.prev_id is not None: + retval.add(self.prev_id) + if self.ntdbr_prev_id is not None: + retval.add(self.ntdbr_prev_id) + return retval + + def get_symbol_succ_ids(self): + """Return the succ_ids for symbol successors.""" + + retval = set() + for id in self.branch_ids + self.tag_ids: + retval.add(id) + return retval + + def get_succ_ids(self): + retval = self.get_symbol_succ_ids() + if self.next_id is not None: + retval.add(self.next_id) + if self.ntdbr_next_id is not None: + retval.add(self.ntdbr_next_id) + for id in self.branch_commit_ids: + retval.add(id) + return retval + + def get_ids_closed(self): + # Special handling is needed in the case of non-trunk default + # branches. The following cases have to be handled: + # + # Case 1: Revision 1.1 not deleted; revision 1.2 exists: + # + # 1.1 -----------------> 1.2 + # \ ^ ^ / + # \ | | / + # 1.1.1.1 -> 1.1.1.2 + # + # * 1.1.1.1 closes 1.1 (because its post-commit overwrites 1.1 + # on trunk) + # + # * 1.1.1.2 closes 1.1.1.1 + # + # * 1.2 doesn't close anything (the post-commit from 1.1.1.1 + # already closed 1.1, and no symbols can sprout from the + # post-commit of 1.1.1.2) + # + # Case 2: Revision 1.1 not deleted; revision 1.2 does not exist: + # + # 1.1 .................. + # \ ^ ^ + # \ | | + # 1.1.1.1 -> 1.1.1.2 + # + # * 1.1.1.1 closes 1.1 (because its post-commit overwrites 1.1 + # on trunk) + # + # * 1.1.1.2 closes 1.1.1.1 + # + # Case 3: Revision 1.1 deleted; revision 1.2 exists: + # + # ............... 
1.2 + # ^ ^ / + # | | / + # 1.1.1.1 -> 1.1.1.2 + # + # * 1.1.1.1 doesn't close anything + # + # * 1.1.1.2 closes 1.1.1.1 + # + # * 1.2 doesn't close anything (no symbols can sprout from the + # post-commit of 1.1.1.2) + # + # Case 4: Revision 1.1 deleted; revision 1.2 doesn't exist: + # + # ............... + # ^ ^ + # | | + # 1.1.1.1 -> 1.1.1.2 + # + # * 1.1.1.1 doesn't close anything + # + # * 1.1.1.2 closes 1.1.1.1 + + if self.first_on_branch_id is not None: + # The first CVSRevision on a branch is considered to close the + # branch: + yield self.first_on_branch_id + if self.ntdbr: + # If the 1.1 revision was not deleted, the 1.1.1.1 revision is + # considered to close it: + yield self.prev_id + elif self.ntdbr_prev_id is not None: + # This is the special case of a 1.2 revision that follows a + # non-trunk default branch. Either 1.1 was deleted or the first + # default branch revision closed 1.1, so we don't have to close + # 1.1. Technically, we close the revision on trunk that was + # copied from the last non-trunk default branch revision in a + # post-commit, but for now no symbols can sprout from that + # revision so we ignore that one, too. + pass + elif self.prev_id is not None: + # Since this CVSRevision is not the first on a branch, its + # prev_id is on the same LOD and this item closes that one: + yield self.prev_id + + def _get_branch_ids_recursively(self, cvs_file_items): + """Return the set of all CVSBranches that sprout from this CVSRevision. + + After parent adjustment in FilterSymbolsPass, it is possible for + branches to sprout directly from a CVSRevision, or from those + branches, etc. 
Return all branches that sprout from this + CVSRevision, directly or indirectly.""" + + retval = set() + branch_ids_to_process = list(self.branch_ids) + while branch_ids_to_process: + branch = cvs_file_items[branch_ids_to_process.pop()] + retval.add(branch) + branch_ids_to_process.extend(branch.branch_ids) + + return retval + + def check_links(self, cvs_file_items): + assert self.cvs_file == cvs_file_items.cvs_file + + prev = cvs_file_items.get(self.prev_id) + next = cvs_file_items.get(self.next_id) + first_on_branch = cvs_file_items.get(self.first_on_branch_id) + ntdbr_next = cvs_file_items.get(self.ntdbr_next_id) + ntdbr_prev = cvs_file_items.get(self.ntdbr_prev_id) + effective_prev = cvs_file_items.get(self.get_effective_prev_id()) + + if prev is None: + # This is the first CVSRevision on trunk or a detached branch: + assert self.id in cvs_file_items.root_ids + elif first_on_branch is not None: + # This is the first CVSRevision on an existing branch: + assert isinstance(first_on_branch, CVSBranch) + assert first_on_branch.symbol == self.lod + assert first_on_branch.next_id == self.id + cvs_revision_source = first_on_branch.get_cvs_revision_source( + cvs_file_items + ) + assert cvs_revision_source.id == prev.id + assert self.id in prev.branch_commit_ids + else: + # This revision follows another revision on the same LOD: + assert prev.next_id == self.id + assert prev.lod == self.lod + + if next is not None: + assert next.prev_id == self.id + assert next.lod == self.lod + + if ntdbr_next is not None: + assert self.ntdbr + assert ntdbr_next.ntdbr_prev_id == self.id + + if ntdbr_prev is not None: + assert ntdbr_prev.ntdbr_next_id == self.id + + for tag_id in self.tag_ids: + tag = cvs_file_items[tag_id] + assert isinstance(tag, CVSTag) + assert tag.source_id == self.id + assert tag.source_lod == self.lod + + for branch_id in self.branch_ids: + branch = cvs_file_items[branch_id] + assert isinstance(branch, CVSBranch) + assert branch.source_id == self.id + assert 
branch.source_lod == self.lod + + branch_commit_ids = list(self.branch_commit_ids) + + for branch in self._get_branch_ids_recursively(cvs_file_items): + assert isinstance(branch, CVSBranch) + if branch.next_id is not None: + assert branch.next_id in branch_commit_ids + branch_commit_ids.remove(branch.next_id) + + assert not branch_commit_ids + + assert self.__class__ == cvs_revision_type_map[( + isinstance(self, CVSRevisionModification), + effective_prev is not None + and isinstance(effective_prev, CVSRevisionModification), + )] + + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return '%s:%s<%x>' % (self.cvs_file, self.rev, self.id,) + + +class CVSRevisionModification(CVSRevision): + """Base class for CVSRevisionAdd or CVSRevisionChange.""" + + __slots__ = [] + + def get_cvs_symbol_ids_opened(self): + return self.tag_ids + self.branch_ids + + +class CVSRevisionAdd(CVSRevisionModification): + """A CVSRevision that creates a file that previously didn't exist. + + The file might have never existed on this LOD, or it might have + existed previously but been deleted by a CVSRevisionDelete.""" + + __slots__ = [] + + +class CVSRevisionChange(CVSRevisionModification): + """A CVSRevision that modifies a file that already existed on this LOD.""" + + __slots__ = [] + + +class CVSRevisionAbsent(CVSRevision): + """A CVSRevision for which the file is nonexistent on this LOD.""" + + __slots__ = [] + + def get_cvs_symbol_ids_opened(self): + return [] + + +class CVSRevisionDelete(CVSRevisionAbsent): + """A CVSRevision that deletes a file that existed on this LOD.""" + + __slots__ = [] + + +class CVSRevisionNoop(CVSRevisionAbsent): + """A CVSRevision that doesn't do anything. + + The revision was 'dead' and the predecessor either didn't exist or + was also 'dead'. 
These revisions can't necessarily be thrown away + because (1) they impose ordering constraints on other items; (2) + they might have a nontrivial log message that we don't want to throw + away.""" + + __slots__ = [] + + +# A map +# +# {(nondead(cvs_rev), nondead(prev_cvs_rev)) : cvs_revision_subtype} +# +# , where nondead() means that the cvs revision exists and is not +# 'dead', and CVS_REVISION_SUBTYPE is the subtype of CVSRevision that +# should be used for CVS_REV. +cvs_revision_type_map = { + (False, False) : CVSRevisionNoop, + (False, True) : CVSRevisionDelete, + (True, False) : CVSRevisionAdd, + (True, True) : CVSRevisionChange, + } + + +class CVSSymbol(CVSItem): + """Represent a symbol on a particular CVSFile. + + This is the base class for CVSBranch and CVSTag. + + Members: + + id -- (int) unique ID for this item. + + cvs_file -- (CVSFile) CVSFile affected by this item. + + symbol -- (Symbol) the symbol affected by this CVSSymbol. + + source_lod -- (LineOfDevelopment) the LOD that is the source for + this CVSSymbol. + + source_id -- (int) the ID of the CVSRevision or CVSBranch that is + the source for this item. This initially points to a + CVSRevision, but can be changed to a CVSBranch via parent + adjustment in FilterSymbolsPass. + + revision_recorder_token -- (arbitrary) a token that can be set by + RevisionRecorder for the later use of RevisionReader. 
+ + """ + + __slots__ = [ + 'symbol', + 'source_lod', + 'source_id', + ] + + def __init__( + self, id, cvs_file, symbol, source_lod, source_id, + revision_recorder_token + ): + """Initialize a CVSSymbol object.""" + + CVSItem.__init__(self, id, cvs_file, revision_recorder_token) + + self.symbol = symbol + self.source_lod = source_lod + self.source_id = source_id + + def get_cvs_revision_source(self, cvs_file_items): + """Return the CVSRevision that is the ultimate source of this symbol.""" + + cvs_source = cvs_file_items[self.source_id] + while not isinstance(cvs_source, CVSRevision): + cvs_source = cvs_file_items[cvs_source.source_id] + + return cvs_source + + def get_svn_path(self): + return self.symbol.get_path(self.cvs_file.cvs_path) + + def get_ids_closed(self): + # A Symbol does not close any other CVSItems: + return [] + + +class CVSBranch(CVSSymbol): + """Represent the creation of a branch in a particular CVSFile. + + Members: + + id -- (int) unique ID for this item. + + cvs_file -- (CVSFile) CVSFile affected by this item. + + symbol -- (Symbol) the symbol affected by this CVSSymbol. + + branch_number -- (string) the number of this branch (e.g., + '1.3.4'), or None if this is a converted CVSTag. + + source_lod -- (LineOfDevelopment) the LOD that is the source for + this CVSSymbol. + + source_id -- (int) id of the CVSRevision or CVSBranch from which + this branch sprouts. This initially points to a CVSRevision, + but can be changed to a CVSBranch via parent adjustment in + FilterSymbolsPass. + + next_id -- (int or None) id of first CVSRevision on this branch, + if any; else, None. + + tag_ids -- (list of int) ids of all CVSTags rooted at this + CVSBranch (can be set due to parent adjustment in + FilterSymbolsPass). + + branch_ids -- (list of int) ids of all CVSBranches rooted at this + CVSBranch (can be set due to parent adjustment in + FilterSymbolsPass). 
+ + opened_symbols -- (None or list of (symbol_id, cvs_symbol_id) + tuples) information about all CVSSymbols opened by this + branch. This member is set in FilterSymbolsPass; before then, + it is None. + + revision_recorder_token -- (arbitrary) a token that can be set by + RevisionRecorder for the later use of RevisionReader. + + """ + + __slots__ = [ + 'branch_number', + 'next_id', + 'tag_ids', + 'branch_ids', + 'opened_symbols', + ] + + def __init__( + self, id, cvs_file, symbol, branch_number, + source_lod, source_id, next_id, + revision_recorder_token, + ): + """Initialize a CVSBranch.""" + + CVSSymbol.__init__( + self, id, cvs_file, symbol, source_lod, source_id, + revision_recorder_token + ) + self.branch_number = branch_number + self.next_id = next_id + self.tag_ids = [] + self.branch_ids = [] + self.opened_symbols = None + + def __getstate__(self): + return ( + self.id, self.cvs_file.id, + self.symbol.id, self.branch_number, + self.source_lod.id, self.source_id, self.next_id, + self.tag_ids, self.branch_ids, + self.opened_symbols, + self.revision_recorder_token, + ) + + def __setstate__(self, data): + ( + self.id, cvs_file_id, + symbol_id, self.branch_number, + source_lod_id, self.source_id, self.next_id, + self.tag_ids, self.branch_ids, + self.opened_symbols, + self.revision_recorder_token, + ) = data + self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id) + self.symbol = Ctx()._symbol_db.get_symbol(symbol_id) + self.source_lod = Ctx()._symbol_db.get_symbol(source_lod_id) + + def get_pred_ids(self): + return set([self.source_id]) + + def get_succ_ids(self): + retval = set(self.tag_ids + self.branch_ids) + if self.next_id is not None: + retval.add(self.next_id) + return retval + + def get_cvs_symbol_ids_opened(self): + return self.tag_ids + self.branch_ids + + def check_links(self, cvs_file_items): + source = cvs_file_items.get(self.source_id) + next = cvs_file_items.get(self.next_id) + + assert self.id in source.branch_ids + if isinstance(source, 
CVSRevision): + assert self.source_lod == source.lod + elif isinstance(source, CVSBranch): + assert self.source_lod == source.symbol + else: + assert False + + if next is not None: + assert isinstance(next, CVSRevision) + assert next.lod == self.symbol + assert next.first_on_branch_id == self.id + + for tag_id in self.tag_ids: + tag = cvs_file_items[tag_id] + assert isinstance(tag, CVSTag) + assert tag.source_id == self.id + assert tag.source_lod == self.symbol + + for branch_id in self.branch_ids: + branch = cvs_file_items[branch_id] + assert isinstance(branch, CVSBranch) + assert branch.source_id == self.id + assert branch.source_lod == self.symbol + + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return '%s:%s:%s<%x>' \ + % (self.cvs_file, self.symbol, self.branch_number, self.id,) + + +class CVSBranchNoop(CVSBranch): + """A CVSBranch whose source is a CVSRevisionAbsent.""" + + __slots__ = [] + + def get_cvs_symbol_ids_opened(self): + return [] + + +# A map +# +# {nondead(source_cvs_rev) : cvs_branch_subtype} +# +# , where nondead() means that the cvs revision exists and is not +# 'dead', and CVS_BRANCH_SUBTYPE is the subtype of CVSBranch that +# should be used. +cvs_branch_type_map = { + False : CVSBranchNoop, + True : CVSBranch, + } + + +class CVSTag(CVSSymbol): + """Represent the creation of a tag on a particular CVSFile. + + Members: + + id -- (int) unique ID for this item. + + cvs_file -- (CVSFile) CVSFile affected by this item. + + symbol -- (Symbol) the symbol affected by this CVSSymbol. + + source_lod -- (LineOfDevelopment) the LOD that is the source for + this CVSSymbol. + + source_id -- (int) the ID of the CVSRevision or CVSBranch that is + being tagged. This initially points to a CVSRevision, but can + be changed to a CVSBranch via parent adjustment in + FilterSymbolsPass. + + revision_recorder_token -- (arbitrary) a token that can be set by + RevisionRecorder for the later use of RevisionReader. 
+ + """ + + __slots__ = [] + + def __init__( + self, id, cvs_file, symbol, source_lod, source_id, + revision_recorder_token, + ): + """Initialize a CVSTag.""" + + CVSSymbol.__init__( + self, id, cvs_file, symbol, source_lod, source_id, + revision_recorder_token, + ) + + def __getstate__(self): + return ( + self.id, self.cvs_file.id, self.symbol.id, + self.source_lod.id, self.source_id, + self.revision_recorder_token, + ) + + def __setstate__(self, data): + ( + self.id, cvs_file_id, symbol_id, source_lod_id, self.source_id, + self.revision_recorder_token, + ) = data + self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id) + self.symbol = Ctx()._symbol_db.get_symbol(symbol_id) + self.source_lod = Ctx()._symbol_db.get_symbol(source_lod_id) + + def get_pred_ids(self): + return set([self.source_id]) + + def get_succ_ids(self): + return set() + + def get_cvs_symbol_ids_opened(self): + return [] + + def check_links(self, cvs_file_items): + source = cvs_file_items.get(self.source_id) + + assert self.id in source.tag_ids + if isinstance(source, CVSRevision): + assert self.source_lod == source.lod + elif isinstance(source, CVSBranch): + assert self.source_lod == source.symbol + else: + assert False + + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return '%s:%s<%x>' \ + % (self.cvs_file, self.symbol, self.id,) + + +class CVSTagNoop(CVSTag): + """A CVSTag whose source is a CVSRevisionAbsent.""" + + __slots__ = [] + + +# A map +# +# {nondead(source_cvs_rev) : cvs_tag_subtype} +# +# , where nondead() means that the cvs revision exists and is not +# 'dead', and CVS_TAG_SUBTYPE is the subtype of CVSTag that should be +# used. +cvs_tag_type_map = { + False : CVSTagNoop, + True : CVSTag, + } + + diff --git a/cvs2svn_lib/cvs_item_database.py b/cvs2svn_lib/cvs_item_database.py new file mode 100644 index 0000000..f072252 --- /dev/null +++ b/cvs2svn_lib/cvs_item_database.py @@ -0,0 +1,248 @@ +# (Be in -*- python -*- mode.) 
#
# ====================================================================
# Copyright (c) 2000-2008 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains a database that can store arbitrary CVSItems."""


import re
import cPickle

from cvs2svn_lib.cvs_item import CVSRevisionAdd
from cvs2svn_lib.cvs_item import CVSRevisionChange
from cvs2svn_lib.cvs_item import CVSRevisionDelete
from cvs2svn_lib.cvs_item import CVSRevisionNoop
from cvs2svn_lib.cvs_item import CVSBranch
from cvs2svn_lib.cvs_item import CVSBranchNoop
from cvs2svn_lib.cvs_item import CVSTag
from cvs2svn_lib.cvs_item import CVSTagNoop
from cvs2svn_lib.cvs_file_items import CVSFileItems
from cvs2svn_lib.serializer import Serializer
from cvs2svn_lib.serializer import PrimedPickleSerializer
from cvs2svn_lib.database import IndexedStore


# The concrete CVSItem classes that are "primed" into the pickler's
# memo so that per-record pickles need not repeat class metadata:
cvs_item_primer = (
    CVSRevisionAdd, CVSRevisionChange,
    CVSRevisionDelete, CVSRevisionNoop,
    CVSBranch, CVSBranchNoop,
    CVSTag, CVSTagNoop,
    )


class NewCVSItemStore:
  """A file of sequential CVSItems, grouped by CVSFile.

  The file consists of a sequence of pickles. The zeroth one is a
  Serializer as described in the serializer module. Subsequent ones
  are pickled lists of CVSItems, each list containing all of the
  CVSItems for a single file.

  We don't use a single pickler for all items because the memo would
  grow too large."""

  def __init__(self, filename):
    """Initialize an instance, creating the file and writing the primer."""

    self.f = open(filename, 'wb')

    # Prime the serializer with every class that can appear in a
    # record so that later records stay small:
    self.serializer = PrimedPickleSerializer(
        cvs_item_primer + (CVSFileItems,)
        )
    # The serializer itself is the zeroth pickle in the file; readers
    # (OldCVSItemStore) load it back before reading any records:
    cPickle.dump(self.serializer, self.f, -1)

  def add(self, cvs_file_items):
    """Write CVS_FILE_ITEMS into the database."""

    self.serializer.dumpf(self.f, cvs_file_items)

  def close(self):
    # Invalidate self.f so that any use after close fails loudly:
    self.f.close()
    self.f = None


class OldCVSItemStore:
  """Read a file created by NewCVSItemStore.

  The file must be read sequentially, one CVSFileItems instance at a
  time."""

  def __init__(self, filename):
    self.f = open(filename, 'rb')

    # Read the memo from the first pickle:
    self.serializer = cPickle.load(self.f)

  def iter_cvs_file_items(self):
    """Iterate through the CVSFileItems instances, one file at a time.

    Each time yield a CVSFileItems instance for one CVSFile."""

    try:
      while True:
        yield self.serializer.loadf(self.f)
    except EOFError:
      # EOF simply marks the end of the record stream:
      return

  def close(self):
    self.f.close()
    self.f = None


class LinewiseSerializer(Serializer):
  """A serializer that writes exactly one line for each object.

  The actual serialization is done by a wrapped serializer; this class
  only escapes any newlines in the serialized data then appends a
  single newline."""

  def __init__(self, wrapee):
    # wrapee -- (Serializer) does the real (de)serialization work.
    self.wrapee = wrapee

  @staticmethod
  def _encode_newlines(s):
    """Return S with newlines and backslashes encoded.

    The string is returned with the following character transformations:

      LF   -> '\\n'
      CR   -> '\\r'
      ^Z   -> '\\z' (needed for Windows)
      '\\' -> '\\\\'

    """

    # Backslash must be escaped first, or it would re-escape the
    # sequences produced by the subsequent replacements:
    return s.replace('\\', '\\\\') \
        .replace('\n', '\\n') \
        .replace('\r', '\\r') \
        .replace('\x1a', '\\z')

  # Matches exactly one encoded escape sequence; used by
  # _decode_newlines():
  _escape_re = re.compile(r'(\\\\|\\n|\\r|\\z)')
  _subst = {'\\n' : '\n', '\\r' : '\r', '\\z' : '\x1a', '\\\\' : '\\'}

  @staticmethod
  def _decode_newlines(s):
    """Return S with newlines and backslashes decoded.

    This function reverses the encoding of _encode_newlines().

    """

    def repl(m):
      return LinewiseSerializer._subst[m.group(1)]

    return LinewiseSerializer._escape_re.sub(repl, s)

  def dumpf(self, f, object):
    f.write(self.dumps(object))

  def dumps(self, object):
    return self._encode_newlines(self.wrapee.dumps(object)) + '\n'

  def loadf(self, f):
    return self.loads(f.readline())

  def loads(self, s):
    # s[:-1] strips the trailing newline appended by dumps():
    return self.wrapee.loads(self._decode_newlines(s[:-1]))


class NewSortableCVSRevisionDatabase(object):
  """A serially-accessible, sortable file for holding CVSRevisions.

  This class creates such files."""

  def __init__(self, filename, serializer):
    self.f = open(filename, 'w')
    self.serializer = LinewiseSerializer(serializer)

  def add(self, cvs_rev):
    # One line per revision, prefixed by the sort keys (metadata id
    # and fixed-width timestamp) so the file can be sorted textually:
    self.f.write(
        '%x %08x %s' % (
            cvs_rev.metadata_id, cvs_rev.timestamp,
            self.serializer.dumps(cvs_rev),
            )
        )

  def close(self):
    self.f.close()
    self.f = None


class OldSortableCVSRevisionDatabase(object):
  """A serially-accessible, sortable file for holding CVSRevisions.

  This class reads such files."""

  def __init__(self, filename, serializer):
    self.filename = filename
    self.serializer = LinewiseSerializer(serializer)

  def __iter__(self):
    f = open(self.filename, 'r')
    for l in f:
      # Strip the two leading sort-key fields; the remainder is the
      # serialized record:
      s = l.split(' ', 2)[-1]
      yield self.serializer.loads(s)
    f.close()

  def close(self):
    pass


class NewSortableCVSSymbolDatabase(object):
  """A serially-accessible, sortable file for holding CVSSymbols.

  This class creates such files."""

  def __init__(self, filename, serializer):
    self.f = open(filename, 'w')
    self.serializer = LinewiseSerializer(serializer)

  def add(self, cvs_symbol):
    # One line per symbol, prefixed by the symbol id as the sort key:
    self.f.write(
        '%x %s' % (cvs_symbol.symbol.id, self.serializer.dumps(cvs_symbol))
        )

  def close(self):
    self.f.close()
    self.f = None


class OldSortableCVSSymbolDatabase(object):
  """A serially-accessible, sortable file for holding CVSSymbols.

  This class reads such files."""

  def __init__(self, filename, serializer):
    self.filename = filename
    self.serializer = LinewiseSerializer(serializer)

  def __iter__(self):
    f = open(self.filename, 'r')
    for l in f:
      # Strip the single leading sort-key field:
      s = l.split(' ', 1)[-1]
      yield self.serializer.loads(s)
    f.close()

  def close(self):
    pass


def IndexedCVSItemStore(filename, index_filename, mode):
  """Return an IndexedStore for CVSItems, primed with the CVSItem classes."""

  return IndexedStore(
      filename, index_filename, mode,
      PrimedPickleSerializer(cvs_item_primer)
      )


diff --git a/cvs2svn_lib/cvs_revision_manager.py b/cvs2svn_lib/cvs_revision_manager.py
new file mode 100644
index 0000000..6f5de3b
--- /dev/null
+++ b/cvs2svn_lib/cvs_revision_manager.py
@@ -0,0 +1,85 @@
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2008 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
+# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""Access the CVS repository via CVS's 'cvs' command.""" + + +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.process import check_command_runs +from cvs2svn_lib.process import PipeStream +from cvs2svn_lib.process import CommandFailedException +from cvs2svn_lib.revision_manager import RevisionReader + + +class CVSRevisionReader(RevisionReader): + """A RevisionReader that reads the contents via CVS.""" + + # Different versions of CVS support different global arguments. + # Here are the global arguments that we try to use, in order of + # decreasing preference: + _possible_global_arguments = [ + ['-q', '-R', '-f'], + ['-q', '-R'], + ['-q', '-f'], + ['-q'], + ] + + def __init__(self, cvs_executable): + self.cvs_executable = cvs_executable + + for global_arguments in self._possible_global_arguments: + try: + self._check_cvs_runs(global_arguments) + except CommandFailedException, e: + pass + else: + # Those global arguments were OK; use them for all CVS invocations. + self.global_arguments = global_arguments + break + else: + raise FatalError( + '%s\n' + 'Please check that cvs is installed and in your PATH.' % (e,) + ) + + def _check_cvs_runs(self, global_arguments): + """Check that CVS can be started. + + Try running 'cvs --version' with the current setting for + self.cvs_executable and the specified global_arguments. 
If not + successful, raise a CommandFailedException.""" + + check_command_runs( + [self.cvs_executable] + global_arguments + ['--version'], + self.cvs_executable, + ) + + def get_content_stream(self, cvs_rev, suppress_keyword_substitution=False): + project = cvs_rev.cvs_file.project + pipe_cmd = [ + self.cvs_executable + ] + self.global_arguments + [ + '-d', project.cvs_repository_root, + 'co', + '-r' + cvs_rev.rev, + '-p' + ] + if suppress_keyword_substitution: + pipe_cmd.append('-kk') + pipe_cmd.append(project.cvs_module + cvs_rev.cvs_path) + return PipeStream(pipe_cmd) + + diff --git a/cvs2svn_lib/database.py b/cvs2svn_lib/database.py new file mode 100644 index 0000000..9db9be2 --- /dev/null +++ b/cvs2svn_lib/database.py @@ -0,0 +1,322 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
# ====================================================================

"""This module contains database facilities used by cvs2svn."""


import sys
import os
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import Log
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable


# DBM module selection.  NOTE: this import-time code has module-level
# side effects (it patches sys.modules and anydbm._defaultmod), so it
# must run before any anydbm.open() call elsewhere in the program.

# 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
# so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.
import anydbm
if anydbm._defaultmod.__name__ in ['dumbdbm', 'dbm']:
  Log().error(
      '%s: cvs2svn uses the anydbm package, which depends on lower level '
      'dbm\n'
      'libraries. Your system has %s, with which cvs2svn is known to have\n'
      'problems. To use cvs2svn, you must install a Python dbm library '
      'other than\n'
      'dumbdbm or dbm. See '
      'http://python.org/doc/current/lib/module-anydbm.html\n'
      'for more information.\n'
      % (error_prefix, anydbm._defaultmod.__name__,)
      )
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
# Unfortunately, gdbm appears not to be trouble free, either.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    Log().warn(
        '%s: The version of the bsddb module found on your computer '
        'has been\n'
        'reported to malfunction on some datasets, causing KeyError '
        'exceptions.\n'
        % (warning_prefix,)
        )
  else:
    anydbm._defaultmod = gdbm


class Database:
  """A database that uses a Serializer to store objects of a certain type.

  The serializer is stored in the database under the key
  self.serializer_key. (This implies that self.serializer_key may not
  be used as a key for normal entries.)

  The backing database is an anydbm-based DBM.

  """

  # An arbitrary string that is unlikely to collide with a real entry
  # key; the pickled serializer is stored under this key:
  serializer_key = '_.%$1\t;_ '

  def __init__(self, filename, mode, serializer=None):
    """Constructor.

    The database stores its Serializer, so none needs to be supplied
    when opening an existing database."""

    # pybsddb3 has a bug which prevents it from working with
    # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
    # causes the DB_TRUNCATE flag to be passed, which is disallowed
    # for databases protected by lock and transaction support
    # (bsddb databases use locking from bsddb version 4.2.4 onwards).
    #
    # Therefore, manually perform the removal (we can do this, because
    # we know that for bsddb - but *not* anydbm in general - the database
    # consists of one file with the name we specify, rather than several
    # based on that name).
    if mode == DB_OPEN_NEW and anydbm._defaultmod.__name__ == 'dbhash':
      if os.path.isfile(filename):
        os.unlink(filename)
      self.db = anydbm.open(filename, 'c')
    else:
      self.db = anydbm.open(filename, mode)

    # Import implementations for many mapping interface methods.  The
    # methods defined below on this class are fallbacks, used only
    # when the underlying DBM object does not supply its own:
    for meth_name in ('__delitem__',
        '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
      meth_ref = getattr(self.db, meth_name, None)
      if meth_ref:
        setattr(self, meth_name, meth_ref)

    if mode == DB_OPEN_NEW:
      # A new database records its serializer for later reopening:
      self.serializer = serializer
      self.db[self.serializer_key] = cPickle.dumps(self.serializer)
    else:
      # An existing database supplies its own serializer:
      self.serializer = cPickle.loads(self.db[self.serializer_key])

  def __getitem__(self, key):
    return self.serializer.loads(self.db[key])

  def __setitem__(self, key, value):
    self.db[key] = self.serializer.dumps(value)

  def __delitem__(self, key):
    # gdbm defines a __delitem__ method, but it cannot be assigned. So
    # this method provides a fallback definition via explicit delegation:
    del self.db[key]

  def keys(self):
    # Hide the serializer's reserved key from callers:
    retval = self.db.keys()
    retval.remove(self.serializer_key)
    return retval

  def __iter__(self):
    for key in self.keys():
      yield key

  def has_key(self, key):
    # EAFP probe; avoids assuming the DBM provides has_key itself:
    try:
      self.db[key]
      return True
    except KeyError:
      return False

  def __contains__(self, key):
    return self.has_key(key)

  def iterkeys(self):
    return self.__iter__()

  def clear(self):
    # Deletes only normal entries; the serializer entry is preserved
    # because keys() filters it out:
    for key in self.keys():
      del self[key]

  def items(self):
    return [(key, self[key],) for key in self.keys()]

  def values(self):
    return [self[key] for key in self.keys()]

  def get(self, key, default=None):
    try:
      return self[key]
    except KeyError:
      return default

  def close(self):
    self.db.close()
    self.db = None


class IndexedDatabase:
  """A file of objects that are written sequentially and read randomly.

  The objects are indexed by small non-negative integers, and a
  RecordTable is used to store the index -> fileoffset map.
  fileoffset=0 is used to represent an empty record. (An offset of 0
  cannot occur for a legitimate record because the serializer is
  written there.)

  The main file consists of a sequence of pickles (or other serialized
  data format). The zeroth record is a pickled Serializer.
  Subsequent ones are objects serialized using the serializer. The
  offset of each object in the file is stored to an index table so
  that the data can later be retrieved randomly.

  Objects are always stored to the end of the file. If an object is
  deleted or overwritten, the fact is recorded in the index_table but
  the space in the pickle file is not garbage collected. This has the
  advantage that one can create a modified version of a database that
  shares the main data file with an old version by copying the index
  file. But it has the disadvantage that space is wasted whenever
  objects are written multiple times."""

  def __init__(self, filename, index_filename, mode, serializer=None):
    """Initialize an IndexedDatabase, writing the serializer if necessary.

    SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
    serializer is read from the file."""

    self.filename = filename
    self.index_filename = index_filename
    self.mode = mode
    if self.mode == DB_OPEN_NEW:
      self.f = open(self.filename, 'wb+')
    elif self.mode == DB_OPEN_WRITE:
      self.f = open(self.filename, 'rb+')
    elif self.mode == DB_OPEN_READ:
      self.f = open(self.filename, 'rb')
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)

    self.index_table = RecordTable(
        self.index_filename, self.mode, FileOffsetPacker()
        )

    if self.mode == DB_OPEN_NEW:
      assert serializer is not None
      self.serializer = serializer
      cPickle.dump(self.serializer, self.f, -1)
    else:
      # Read the memo from the first pickle:
      self.serializer = cPickle.load(self.f)

    # Seek to the end of the file, and record that position:
    self.f.seek(0, 2)
    # self.fp tracks the current physical file position (None when
    # unknown); self.eofp tracks the end-of-file offset where the
    # next record will be appended:
    self.fp = self.f.tell()
    self.eofp = self.fp

  def __setitem__(self, index, item):
    """Write ITEM into the database indexed by INDEX."""

    # Make sure we're at the end of the file:
    if self.fp != self.eofp:
      self.f.seek(self.eofp)
    self.index_table[index] = self.eofp
    s = self.serializer.dumps(item)
    self.f.write(s)
    self.eofp += len(s)
    self.fp = self.eofp

  def _fetch(self, offset):
    # Seek only when necessary; sequential reads avoid the syscall:
    if self.fp != offset:
      self.f.seek(offset)

    # There is no easy way to tell how much data will be read, so just
    # indicate that we don't know the current file pointer:
    self.fp = None

    return self.serializer.loadf(self.f)

  def iterkeys(self):
    return self.index_table.iterkeys()

  def itervalues(self):
    for offset in self.index_table.itervalues():
      yield self._fetch(offset)

  def __getitem__(self, index):
    offset = self.index_table[index]
    return self._fetch(offset)

  def get(self, item, default=None):
    try:
      return self[item]
    except KeyError:
      return default

  def get_many(self, indexes, default=None):
    """Yield (index,item) tuples for INDEXES, in arbitrary order.

    Yield (index,default) for indexes with no defined values."""

    offsets = []
    for (index, offset) in self.index_table.get_many(indexes):
      if offset is None:
        yield (index, default)
      else:
        offsets.append((offset, index))

    # Sort the offsets to reduce disk seeking:
    offsets.sort()
    for (offset,index) in offsets:
      yield (index, self._fetch(offset))

  def __delitem__(self, index):
    # We don't actually free the data in self.f.
    del self.index_table[index]

  def close(self):
    self.index_table.close()
    self.index_table = None
    self.f.close()
    self.f = None

  def __str__(self):
    return 'IndexedDatabase(%r)' % (self.filename,)


class IndexedStore(IndexedDatabase):
  """A file of items that is written sequentially and read randomly.

  This is just like IndexedDatabase, except that it has an additional
  add() method which assumes that the object to be written to the
  database has an 'id' member, which is used as its database index.
  See IndexedDatabase for more information."""

  def add(self, item):
    """Write ITEM into the database indexed by ITEM.id."""

    self[item.id] = item


diff --git a/cvs2svn_lib/dumpfile_delegate.py b/cvs2svn_lib/dumpfile_delegate.py
new file mode 100644
index 0000000..092cfca
--- /dev/null
+++ b/cvs2svn_lib/dumpfile_delegate.py
@@ -0,0 +1,510 @@
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2009 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains database facilities used by cvs2svn."""


try:
  from hashlib import md5
except ImportError:
  from md5 import new as md5


from cvs2svn_lib import config
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.common import InternalError
from cvs2svn_lib.common import path_split
from cvs2svn_lib.context import Ctx
from cvs2svn_lib.cvs_file import CVSDirectory
from cvs2svn_lib.cvs_file import CVSFile
from cvs2svn_lib.svn_repository_delegate import SVNRepositoryDelegate
from cvs2svn_lib.apple_single_filter import get_maybe_apple_single_stream


# Things that can happen to a file.
+OP_ADD = 'add' +OP_CHANGE = 'change' + + +class DumpfileDelegate(SVNRepositoryDelegate): + """Create a Subversion dumpfile.""" + + def __init__(self, revision_reader, dumpfile_path): + """Return a new DumpfileDelegate instance, attached to a dumpfile + DUMPFILE_PATH, using Ctx().cvs_filename_decoder().""" + + self._revision_reader = revision_reader + self.dumpfile_path = dumpfile_path + + self.dumpfile = open(self.dumpfile_path, 'wb') + self._write_dumpfile_header(self.dumpfile) + + # A set of the basic project infrastructure project directories + # that have been created so far, as SVN paths. (The root + # directory is considered to be present at initialization.) This + # includes all of the LOD paths, and all of their parent + # directories etc. + self._basic_directories = set(['']) + + def _write_dumpfile_header(self, dumpfile): + # Initialize the dumpfile with the standard headers. + # + # Since the CVS repository doesn't have a UUID, and the Subversion + # repository will be created with one anyway, we don't specify a + # UUID in the dumpflie + dumpfile.write('SVN-fs-dump-format-version: 2\n\n') + + def _utf8_path(self, path): + """Return a copy of PATH encoded in UTF-8.""" + + # Convert each path component separately (as they may each use + # different encodings). + try: + return '/'.join([ + Ctx().cvs_filename_decoder(piece).encode('utf8') + for piece in path.split('/') + ]) + except UnicodeError: + raise FatalError( + "Unable to convert a path '%s' to internal encoding.\n" + "Consider rerunning with one or more '--encoding' parameters or\n" + "with '--fallback-encoding'." 
+ % (path,)) + + def _string_for_prop(self, name, value): + """Return a property in the form needed for the dumpfile.""" + + return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value) + + def start_commit(self, revnum, revprops): + """Emit the start of SVN_COMMIT (an SVNCommit).""" + + self.revision = revnum + + # The start of a new commit typically looks like this: + # + # Revision-number: 1 + # Prop-content-length: 129 + # Content-length: 129 + # + # K 7 + # svn:log + # V 27 + # Log message for revision 1. + # K 10 + # svn:author + # V 7 + # jrandom + # K 8 + # svn:date + # V 27 + # 2003-04-22T22:57:58.132837Z + # PROPS-END + # + # Notice that the length headers count everything -- not just the + # length of the data but also the lengths of the lengths, including + # the 'K ' or 'V ' prefixes. + # + # The reason there are both Prop-content-length and Content-length + # is that the former includes just props, while the latter includes + # everything. That's the generic header form for any entity in a + # dumpfile. But since revisions only have props, the two lengths + # are always the same for revisions. + + # Calculate the output needed for the property definitions. 
+ prop_names = revprops.keys() + prop_names.sort() + prop_strings = [] + for propname in prop_names: + if revprops[propname] is not None: + prop_strings.append( + self._string_for_prop(propname, revprops[propname])) + + all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n' + total_len = len(all_prop_strings) + + # Print the revision header and revprops + self.dumpfile.write( + 'Revision-number: %d\n' + 'Prop-content-length: %d\n' + 'Content-length: %d\n' + '\n' + '%s' + '\n' + % (self.revision, total_len, total_len, all_prop_strings) + ) + + def end_commit(self): + pass + + def _make_any_dir(self, path): + """Emit the creation of directory PATH.""" + + self.dumpfile.write( + "Node-path: %s\n" + "Node-kind: dir\n" + "Node-action: add\n" + "\n" + "\n" + % self._utf8_path(path) + ) + + def _register_basic_directory(self, path, create): + """Register the creation of PATH if it is not already there. + + Create any parent directories that do not already exist. If + CREATE is set, also create PATH if it doesn't already exist. This + method should only be used for the LOD paths and the directories + containing them, not for directories within an LOD path.""" + + if path not in self._basic_directories: + # Make sure that the parent directory is present: + self._register_basic_directory(path_split(path)[0], True) + if create: + self._make_any_dir(path) + self._basic_directories.add(path) + + def initialize_project(self, project): + """Create any initial directories for the project. + + The trunk, tags, and branches directories directories are created + the first time the project is seen. 
Be sure not to create parent
    directories that already exist (e.g., because two directories
    share part of their paths either within or across projects)."""

    for path in project.get_initial_directories():
      self._register_basic_directory(path, True)

  def initialize_lod(self, lod):
    # Create the root directory for LOD (trunk/branch/tag), together
    # with any missing parent directories:
    lod_path = lod.get_path()
    if lod_path:
      self._register_basic_directory(lod_path, True)

  def mkdir(self, lod, cvs_directory):
    """Emit the creation of CVS_DIRECTORY within LOD."""

    self._make_any_dir(lod.get_path(cvs_directory.cvs_path))

  def _add_or_change_path(self, s_item, op):
    """Emit the addition or change corresponding to S_ITEM.

    OP is either the constant OP_ADD or OP_CHANGE."""

    assert op in [OP_ADD, OP_CHANGE]

    # Convenience variables
    cvs_rev = s_item.cvs_rev

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, the only properties we ever set are set in the first
    # revision; all other revisions (including on branches) inherit
    # from that.  After the first revision, we never change file
    # properties, therefore, there is no need to remember the full set
    # of properties on a given file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and length, if any.
    if s_item.svn_props_changed:
      svn_props = s_item.svn_props
      prop_contents = ''
      prop_names = svn_props.keys()
      prop_names.sort()
      for pname in prop_names:
        pvalue = svn_props[pname]
        if pvalue is not None:
          prop_contents += self._string_for_prop(pname, pvalue)
      prop_contents += 'PROPS-END\n'
      props_header = 'Prop-content-length: %d\n' % len(prop_contents)
    else:
      prop_contents = ''
      props_header = ''

    # If the file has keywords, we must prevent CVS/RCS from expanding
    # the keywords because they must be unexpanded in the repository,
    # or Subversion will get confused.
    stream = self._revision_reader.get_content_stream(
        cvs_rev, suppress_keyword_substitution=s_item.has_keywords()
        )

    if Ctx().decode_apple_single:
      # Insert a filter to decode any files that are in AppleSingle
      # format:
      stream = get_maybe_apple_single_stream(stream)

    # Insert a filter to convert all EOLs to LFs if necessary

    eol_style = s_item.svn_props.get('svn:eol-style', None)
    if eol_style:
      stream = LF_EOL_Filter(stream, eol_style)

    buf = None

    # treat .cvsignore as a directory property
    dir_path, basename = path_split(cvs_rev.get_svn_path())
    if basename == '.cvsignore':
      buf = stream.read()
      ignore_vals = generate_ignores(buf)
      ignore_contents = '\n'.join(ignore_vals)
      if ignore_contents:
        ignore_contents += '\n'
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
          (len(ignore_contents), ignore_contents))
      ignore_contents += 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # write headers, then props
      self.dumpfile.write(
          'Node-path: %s\n'
          'Node-kind: dir\n'
          'Node-action: change\n'
          'Prop-content-length: %d\n'
          'Content-length: %d\n'
          '\n'
          '%s'
          % (self._utf8_path(dir_path),
             ignore_len, ignore_len, ignore_contents)
          )
      if not Ctx().keep_cvsignore:
        stream.close()
        return

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-kind: file\n'
        'Node-action: %s\n'
        '%s'  # no property header if no props
        % (self._utf8_path(cvs_rev.get_svn_path()), op, props_header)
        )

    # Remember where the (placeholder) content headers start, so they
    # can be overwritten with the real length/checksum values below:
    pos = self.dumpfile.tell()

    # Fixed-width fields so the rewritten header occupies exactly the
    # same number of bytes as the placeholder:
    content_header_fmt = (
        'Text-content-length: %16d\n'
        'Text-content-md5: %32s\n'
        'Content-length: %16d\n'
        '\n'
        )

    self.dumpfile.write(content_header_fmt % (0, '', 0,))

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5()
    length = 0
    if buf is None:
      buf = stream.read(config.PIPE_READ_SIZE)
    while buf != '':
      checksum.update(buf)
      length += len(buf)
      self.dumpfile.write(buf)
      buf = stream.read(config.PIPE_READ_SIZE)

    stream.close()

    # Go back to overwrite the length and checksum headers with the
    # correct values.  The content length is the length of property
    # data, text data, and any metadata around/inside them:
    self.dumpfile.seek(pos, 0)
    self.dumpfile.write(
        content_header_fmt
        % (length, checksum.hexdigest(), length + len(prop_contents),)
        )

    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
    self.dumpfile.write('\n\n')

  def add_path(self, s_item):
    """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""

    self._add_or_change_path(s_item, OP_ADD)

  def change_path(self, s_item):
    """Emit the change corresponding to S_ITEM, an SVNCommitItem."""

    self._add_or_change_path(s_item, OP_CHANGE)

  def delete_lod(self, lod):
    """Emit the deletion of LOD."""

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-action: delete\n'
        '\n'
        % (self._utf8_path(lod.get_path()),)
        )
    self._basic_directories.remove(lod.get_path())

  def delete_path(self, lod, cvs_path):
    """Emit the deletion of CVS_PATH within LOD."""

    dir_path, basename = path_split(lod.get_path(cvs_path.get_cvs_path()))
    if basename == '.cvsignore':
      # When a .cvsignore file is deleted, the directory's svn:ignore
      # property needs to be deleted.
      ignore_contents = 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # write headers, then props
      self.dumpfile.write(
          'Node-path: %s\n'
          'Node-kind: dir\n'
          'Node-action: change\n'
          'Prop-content-length: %d\n'
          'Content-length: %d\n'
          '\n'
          '%s'
          % (self._utf8_path(dir_path),
             ignore_len, ignore_len, ignore_contents)
          )
      if not Ctx().keep_cvsignore:
        return

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-action: delete\n'
        '\n'
        % (self._utf8_path(lod.get_path(cvs_path.cvs_path)),)
        )

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Emit the copy of SRC_LOD at SRC_REVNUM to DEST_LOD."""

    # Register the main LOD directory, and create parent directories
    # as needed:
    self._register_basic_directory(dest_lod.get_path(), False)

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-kind: dir\n'
        'Node-action: add\n'
        'Node-copyfrom-rev: %d\n'
        'Node-copyfrom-path: %s\n'
        '\n'
        % (self._utf8_path(dest_lod.get_path()),
           src_revnum, self._utf8_path(src_lod.get_path()))
        )

  def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum):
    """Emit the copy of CVS_PATH from SRC_LOD at SRC_REVNUM to DEST_LOD."""

    if isinstance(cvs_path, CVSFile):
      node_kind = 'file'
      if cvs_path.basename == '.cvsignore':
        # FIXME: Here we have to adjust the containing directory's
        # svn:ignore property to reflect the addition of the
        # .cvsignore file to the LOD!  This is awkward because we
        # don't have the contents of the .cvsignore file available.
        if not Ctx().keep_cvsignore:
          return
    elif isinstance(cvs_path, CVSDirectory):
      node_kind = 'dir'
    else:
      raise InternalError()

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-kind: %s\n'
        'Node-action: add\n'
        'Node-copyfrom-rev: %d\n'
        'Node-copyfrom-path: %s\n'
        '\n'
        % (
            self._utf8_path(dest_lod.get_path(cvs_path.cvs_path)),
            node_kind,
            src_revnum,
            self._utf8_path(src_lod.get_path(cvs_path.cvs_path))
            )
        )

  def finish(self):
    """Perform any cleanup necessary after all revisions have been
    committed."""

    self.dumpfile.close()


def generate_ignores(raw_ignore_val):
  """Parse RAW_IGNORE_VAL (a .cvsignore file body) into a pattern list."""

  ignore_vals = [ ]
  for ignore in raw_ignore_val.split():
    # Reset the list if we encounter a '!'
    # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
    if ignore == '!':
      ignore_vals = [ ]
    else:
      ignore_vals.append(ignore)
  return ignore_vals


class LF_EOL_Filter:
  """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
  into the appropriate canonical eol style."""

  eol_style_replacements = {
      'LF' : '\n',
      'CR' : '\r',
      'CRLF' : '\r\n',
      'native' : '\n',
      }

  def __init__(self, stream, eol_style):
    self.stream = stream
    self.replacement = self.eol_style_replacements[eol_style]
    # True iff the previous chunk ended with a CR that might be the
    # first half of a CRLF pair spanning a chunk boundary:
    self.carry_cr = False
    self.eof = False

  def read(self, size=-1):
    # Loop until there is something to return, or EOF is reached (a
    # chunk can become empty after a trailing CR is carried over):
    while True:
      buf = self.stream.read(size)
      self.eof = len(buf) == 0
      if self.carry_cr:
        buf = '\r' + buf
        self.carry_cr = False
      if not self.eof and buf[-1] == '\r':
        # Hold the trailing CR back in case the next chunk starts
        # with LF:
        self.carry_cr = True
        buf = buf[:-1]
      # Normalize all line endings to LF first, then to the requested
      # style if it differs:
      buf = buf.replace('\r\n', '\n')
      buf = buf.replace('\r', '\n')
      if self.replacement != '\n':
        buf = buf.replace('\n', self.replacement)
      if buf or self.eof:
        return buf

  def close(self):
    self.stream.close()
    self.stream = None
#
# ====================================================================
# Copyright (c) 2000-2008 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains classes describing the sources of symbol fills."""


from cvs2svn_lib.common import InternalError
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.common import SVN_INVALID_REVNUM
from cvs2svn_lib.svn_revision_range import SVNRevisionRange
from cvs2svn_lib.svn_revision_range import RevisionScores


class FillSource:
  """Representation of a fill source.

  A FillSource keeps track of the paths that have to be filled in a
  particular symbol fill.

  This class holds a SVNRevisionRange instance for each CVSFile that
  has to be filled within the subtree of the repository rooted at
  self.cvs_path.  The SVNRevisionRange objects are stored in a tree
  in which the directory nodes are dictionaries mapping CVSPaths to
  subnodes and the leaf nodes are the SVNRevisionRange objects telling
  for what source_lod and what range of revisions the leaf could serve
  as a source.

  FillSource objects are able to compute the score for arbitrary
  source LODs and source revision numbers.

  These objects are used by the symbol filler in SVNOutputOption."""

  def __init__(self, cvs_path, symbol, node_tree):
    """Create a fill source.

    The best LOD and SVN REVNUM to use as the copy source can be
    determined by calling compute_best_source().

    Members:

      cvs_path -- (CVSPath): the CVSPath described by this FillSource.

      _symbol -- (Symbol) the symbol to be filled.

      _node_tree -- (dict) a tree stored as a map { CVSPath : node },
          where subnodes have the same form.  Leaves are
          SVNRevisionRange instances telling the source_lod and range
          of SVN revision numbers from which the CVSPath can be
          copied.

    """

    self.cvs_path = cvs_path
    self._symbol = symbol
    self._node_tree = node_tree

  def _set_node(self, cvs_file, svn_revision_range):
    # Record SVN_REVISION_RANGE as the leaf for CVS_FILE, creating any
    # missing intermediate directory nodes along the way:
    parent_node = self._get_node(cvs_file.parent_directory, create=True)
    if cvs_file in parent_node:
      raise InternalError(
          '%s appeared twice in sources for %s' % (cvs_file, self._symbol)
          )
    parent_node[cvs_file] = svn_revision_range

  def _get_node(self, cvs_path, create=False):
    # Recurse from CVS_PATH up to self.cvs_path (the tree root), then
    # look the path component up in its parent node on the way back
    # down.  If CREATE is set, missing directory nodes are created;
    # otherwise a missing node raises KeyError.
    if cvs_path == self.cvs_path:
      return self._node_tree
    else:
      parent_node = self._get_node(cvs_path.parent_directory, create=create)
      try:
        return parent_node[cvs_path]
      except KeyError:
        if create:
          node = {}
          parent_node[cvs_path] = node
          return node
        else:
          raise

  def compute_best_source(self, preferred_source):
    """Determine the best source_lod and subversion revision number to copy.

    Return the best source found, as an SVNRevisionRange instance.  If
    PREFERRED_SOURCE is not None and its opening is among the sources
    with the best scores, return it; otherwise, return the oldest such
    revision on the first such source_lod (ordered by the natural LOD
    sort order).  The return value's source_lod is the best LOD to
    copy from, and its opening_revnum is the best SVN revision."""

    # Aggregate openings and closings from our rev tree
    svn_revision_ranges = self._get_revision_ranges(self._node_tree)

    # Score the lists
    revision_scores = RevisionScores(svn_revision_ranges)

    best_source_lod, best_revnum, best_score = \
        revision_scores.get_best_revnum()

    # Prefer the caller-supplied source if it scores as well as the
    # best one found:
    if (
        preferred_source is not None
        and revision_scores.get_score(preferred_source) == best_score
        ):
      best_source_lod = preferred_source.source_lod
      best_revnum = preferred_source.opening_revnum

    if best_revnum == SVN_INVALID_REVNUM:
      raise FatalError(
          "failed to find a revision to copy from when copying %s"
          % self._symbol.name
          )

    return SVNRevisionRange(best_source_lod, best_revnum)

  def _get_revision_ranges(self, node):
    """Return a list of all the SVNRevisionRanges at and under NODE.

    Include duplicates.  This is a helper method used by
    compute_best_source()."""

    if isinstance(node, SVNRevisionRange):
      # It is a leaf node.
      return [ node ]
    else:
      # It is an intermediate node.
      revision_ranges = []
      for key, subnode in node.items():
        revision_ranges.extend(self._get_revision_ranges(subnode))
      return revision_ranges

  def get_subsources(self):
    """Generate (CVSPath, FillSource) for all direct subsources."""

    # A leaf (SVNRevisionRange) has no subsources:
    if not isinstance(self._node_tree, SVNRevisionRange):
      for cvs_path, node in self._node_tree.items():
        fill_source = FillSource(cvs_path, self._symbol, node)
        yield (cvs_path, fill_source)

  def get_subsource_map(self):
    """Return the map {CVSPath : FillSource} of direct subsources."""

    src_entries = {}

    for (cvs_path, fill_subsource) in self.get_subsources():
      src_entries[cvs_path] = fill_subsource

    return src_entries

  def __str__(self):
    """For convenience only.  The format is subject to change at any time."""

    return '%s(%s:%s)' % (
        self.__class__.__name__, self._symbol, self.cvs_path,
        )

  def __repr__(self):
    """For convenience only.  The format is subject to change at any time."""

    return '%s%r' % (self, self._node_tree,)


def get_source_set(symbol, range_map):
  """Return a FillSource describing the fill sources for RANGE_MAP.

  SYMBOL is either a Branch or a Tag.  RANGE_MAP is a map { CVSSymbol
  : SVNRevisionRange } as returned by
  SymbolingsReader.get_range_map().

  Use the SVNRevisionRanges from RANGE_MAP to create a FillSource
  instance describing the sources for filling SYMBOL."""

  root_cvs_directory = symbol.project.get_root_cvs_directory()
  fill_source = FillSource(root_cvs_directory, symbol, {})

  for cvs_symbol, svn_revision_range in range_map.items():
    fill_source._set_node(cvs_symbol.cvs_file, svn_revision_range)

  return fill_source
It calls its record_fulltext() method with the full text of every
revision.  This method should be overridden to do something with the
fulltext and possibly return a revision_recorder_token."""


from cvs2svn_lib.revision_manager import RevisionRecorder


class FulltextRevisionRecorder:
  """Similar to a RevisionRecorder, but it requires the fulltext."""

  def register_artifacts(self, which_pass):
    pass

  def start(self):
    pass

  def start_file(self, cvs_file_items):
    pass

  def record_fulltext(self, cvs_rev, log, fulltext):
    """Record the fulltext for CVS_REV.

    CVS_REV has the log message LOG and the fulltext FULLTEXT.  This
    method should be overridden to do something sensible with them."""

    raise NotImplementedError()

  def finish_file(self, cvs_file_items):
    pass

  def finish(self):
    pass


class FulltextRevisionRecorderAdapter(RevisionRecorder):
  """Reconstruct the fulltext and pass it to a FulltextRevisionRecorder.

  This class implements RevisionRecorder (so it can be passed directly
  to CollectRevsPass).  But it doesn't actually record anything.
  Instead, it reconstructs the fulltext of each revision, and passes
  the fulltext to a fulltext_revision_recorder."""

  def __init__(self, fulltext_revision_recorder):
    RevisionRecorder.__init__(self)
    self.fulltext_revision_recorder = fulltext_revision_recorder

  def register_artifacts(self, which_pass):
    self.fulltext_revision_recorder.register_artifacts(which_pass)

  def start(self):
    self.fulltext_revision_recorder.start()

  def start_file(self, cvs_file_items):
    self.fulltext_revision_recorder.start_file(cvs_file_items)

  def record_text(self, cvs_rev, log, text):
    """This method should be overridden.

    It should determine the fulltext of CVS_REV, then pass it to
    self.fulltext_revision_recorder.record_fulltext() and return the
    result."""

    raise NotImplementedError()

  def finish_file(self, cvs_file_items):
    self.fulltext_revision_recorder.finish_file(cvs_file_items)

  def finish(self):
    self.fulltext_revision_recorder.finish()


class SimpleFulltextRevisionRecorderAdapter(FulltextRevisionRecorderAdapter):
  """Reconstruct the fulltext using a RevisionReader.

  To create the fulltext, this class simply uses a RevisionReader (for
  example, RCSRevisionReader or CVSRevisionReader).  This is not quite
  as wasteful as using one of these RevisionReaders in OutputPass,
  because the same RCS file will be read over and over (and so
  presumably stay in the disk cache).  But it is still pretty silly,
  considering that we have all the RCS deltas available to us."""

  def __init__(self, revision_reader, fulltext_revision_recorder):
    FulltextRevisionRecorderAdapter.__init__(self, fulltext_revision_recorder)
    self.revision_reader = revision_reader

  def register_artifacts(self, which_pass):
    FulltextRevisionRecorderAdapter.register_artifacts(self, which_pass)
    self.revision_reader.register_artifacts(which_pass)

  def start(self):
    FulltextRevisionRecorderAdapter.start(self)
    self.revision_reader.start()

  def record_text(self, cvs_rev, log, text):
    # FIXME: We have to decide what to do about keyword substitution
    # and eol_style here:
    fulltext = self.revision_reader.get_content_stream(
        cvs_rev, suppress_keyword_substitution=False
        ).read()
    return self.fulltext_revision_recorder.record_fulltext(
        cvs_rev, log, fulltext
        )

  def finish(self):
    FulltextRevisionRecorderAdapter.finish(self)
    self.revision_reader.finish()
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2007-2009 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""Classes for outputting the converted repository to git.

For information about the format allowed by git-fast-import, see:

  http://www.kernel.org/pub/software/scm/git/docs/git-fast-import.html

"""

import bisect

from cvs2svn_lib import config
from cvs2svn_lib.common import InternalError
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.log import Log
from cvs2svn_lib.context import Ctx
from cvs2svn_lib.artifact_manager import artifact_manager
from cvs2svn_lib.openings_closings import SymbolingsReader
from cvs2svn_lib.symbol import Trunk
from cvs2svn_lib.symbol import Branch
from cvs2svn_lib.symbol import Tag
from cvs2svn_lib.cvs_item import CVSRevisionAdd
from cvs2svn_lib.cvs_item import CVSRevisionChange
from cvs2svn_lib.cvs_item import CVSRevisionDelete
from cvs2svn_lib.cvs_item import CVSRevisionNoop
from cvs2svn_lib.cvs_item import CVSSymbol
from cvs2svn_lib.output_option import OutputOption
from cvs2svn_lib.svn_revision_range import RevisionScores
from cvs2svn_lib.repository_mirror import RepositoryMirror
from cvs2svn_lib.key_generator import KeyGenerator


# The branch name to use for the "tag fixup branches".  The
# git-fast-import documentation suggests using 'TAG_FIXUP' (outside of
# the refs/heads namespace), but this is currently broken.  Use a name
# containing '.', which is not allowed in CVS symbols, to avoid
# conflicts (though of course a conflict could still result if the
# user requests symbol transformations).
FIXUP_BRANCH_NAME = 'refs/heads/TAG.FIXUP'


class ExpectedDirectoryError(Exception):
  """A file was found where a directory was expected."""

  pass


class ExpectedFileError(Exception):
  """A directory was found where a file was expected."""

  pass


class GitRevisionWriter(object):
  """Base class for writing file modifications to a git-fast-import stream.

  Subclasses override _modify_file() to emit either inline file
  contents or a reference to a previously-written blob mark."""

  def register_artifacts(self, which_pass):
    pass

  def start(self, f, mirror):
    # f is the git-fast-import output stream; mirror is the
    # RepositoryMirror used to track the current tree state:
    self.f = f
    self._mirror = mirror

  def _modify_file(self, cvs_item, post_commit):
    raise NotImplementedError()

  def _mkdir_p(self, cvs_directory, lod):
    """Make sure that CVS_DIRECTORY exists in LOD.

    If not, create it.  Return the node for CVS_DIRECTORY."""

    try:
      node = self._mirror.get_current_lod_directory(lod)
    except KeyError:
      node = self._mirror.add_lod(lod)

    for sub_path in cvs_directory.get_ancestry()[1:]:
      try:
        node = node[sub_path]
      except KeyError:
        node = node.mkdir(sub_path)
      if node is None:
        # In the mirror, a None entry represents a file:
        raise ExpectedDirectoryError(
            'File found at \'%s\' where directory was expected.' % (sub_path,)
            )

    return node

  def add_file(self, cvs_rev, post_commit):
    cvs_file = cvs_rev.cvs_file
    # Post-commit revisions are applied to trunk rather than to the
    # revision's own LOD:
    if post_commit:
      lod = cvs_file.project.get_trunk()
    else:
      lod = cvs_rev.lod
    parent_node = self._mkdir_p(cvs_file.parent_directory, lod)
    parent_node.add_file(cvs_file)
    self._modify_file(cvs_rev, post_commit)

  def modify_file(self, cvs_rev, post_commit):
    cvs_file = cvs_rev.cvs_file
    if post_commit:
      lod = cvs_file.project.get_trunk()
    else:
      lod = cvs_rev.lod
    # Files are represented in the mirror as None; a non-None entry
    # means a directory is in the way:
    if self._mirror.get_current_path(cvs_file, lod) is not None:
      raise ExpectedFileError(
          'Directory found at \'%s\' where file was expected.' % (cvs_file,)
          )
    self._modify_file(cvs_rev, post_commit)

  def delete_file(self, cvs_rev, post_commit):
    cvs_file = cvs_rev.cvs_file
    if post_commit:
      lod = cvs_file.project.get_trunk()
    else:
      lod = cvs_rev.lod
    parent_node = self._mirror.get_current_path(
        cvs_file.parent_directory, lod
        )
    if parent_node[cvs_file] is not None:
      raise ExpectedFileError(
          'Directory found at \'%s\' where file was expected.' % (cvs_file,)
          )
    del parent_node[cvs_file]
    self.f.write('D %s\n' % (cvs_rev.cvs_file.cvs_path,))

  def process_revision(self, cvs_rev, post_commit):
    # Dispatch on the concrete CVSRevision type:
    if isinstance(cvs_rev, CVSRevisionAdd):
      self.add_file(cvs_rev, post_commit)
    elif isinstance(cvs_rev, CVSRevisionChange):
      self.modify_file(cvs_rev, post_commit)
    elif isinstance(cvs_rev, CVSRevisionDelete):
      self.delete_file(cvs_rev, post_commit)
    elif isinstance(cvs_rev, CVSRevisionNoop):
      pass
    else:
      raise InternalError('Unexpected CVSRevision type: %s' % (cvs_rev,))

  def branch_file(self, cvs_symbol):
    cvs_file = cvs_symbol.cvs_file
    parent_node = self._mkdir_p(cvs_file.parent_directory, cvs_symbol.symbol)
    parent_node.add_file(cvs_file)
    self._modify_file(cvs_symbol, post_commit=False)

  def finish(self):
    del self._mirror
    del self.f


class GitRevisionMarkWriter(GitRevisionWriter):
  """Write file modifications as references to pre-recorded blob marks."""

  def _modify_file(self, cvs_item, post_commit):
    if cvs_item.cvs_file.executable:
      mode = '100755'
    else:
      mode = '100644'

    self.f.write(
        'M %s :%d %s\n'
        % (mode, cvs_item.revision_recorder_token,
           cvs_item.cvs_file.cvs_path,)
        )


class GitRevisionInlineWriter(GitRevisionWriter):
  """Write file modifications with the file contents inline, read via a
  RevisionReader."""

  def __init__(self, revision_reader):
    self.revision_reader = revision_reader

  def register_artifacts(self, which_pass):
    GitRevisionWriter.register_artifacts(self, which_pass)
    self.revision_reader.register_artifacts(which_pass)

  def start(self, f, mirror):
    GitRevisionWriter.start(self, f, mirror)
    self.revision_reader.start()

  def _modify_file(self, cvs_item, post_commit):
    if cvs_item.cvs_file.executable:
      mode = '100755'
    else:
      mode = '100644'

    self.f.write(
        'M %s inline %s\n'
        % (mode, cvs_item.cvs_file.cvs_path,)
        )

    # For a symbol, the contents come from the revision from which the
    # symbol sprouts:
    if isinstance(cvs_item, CVSSymbol):
      cvs_rev = cvs_item.get_cvs_revision_source(Ctx()._cvs_items_db)
    else:
      cvs_rev = cvs_item

    # FIXME: We have to decide what to do about keyword substitution
    # and eol_style here:
    fulltext = self.revision_reader.get_content_stream(
        cvs_rev, suppress_keyword_substitution=False
        ).read()

    self.f.write('data %d\n' % (len(fulltext),))
    self.f.write(fulltext)
    self.f.write('\n')

  def finish(self):
    GitRevisionWriter.finish(self)
    self.revision_reader.finish()


def get_chunks(iterable, chunk_size):
  """Generate lists containing chunks of the output of ITERABLE.

  Each list contains at most CHUNK_SIZE items.  If CHUNK_SIZE is None,
  yield the whole contents of ITERABLE in one list."""

  if chunk_size is None:
    yield list(iterable)
  else:
    it = iter(iterable)
    while True:
      # If this call to it.next() raises StopIteration, then we have
      # no more chunks to emit, so simply pass the exception through:
      chunk = [it.next()]

      # Now try filling the rest of the chunk:
      try:
        while len(chunk) < chunk_size:
          chunk.append(it.next())
      except StopIteration:
        # The iterator was exhausted while filling chunk, but chunk
        # contains at least one element.  Yield it, then we're done.
        yield chunk
        break

      # Yield the full chunk then continue with the next chunk:
      yield chunk
      del chunk


class GitOutputOption(OutputOption):
  """An OutputOption that outputs to a git-fast-import formatted file.

  Members:

    dump_filename -- (string) the name of the file to which the
        git-fast-import commands for defining revisions will be
        written.

    author_transforms -- a map {cvsauthor : (fullname, email)} from
        CVS author names to git full name and email address.  All of
        the contents are 8-bit strings encoded as UTF-8.
+ + """ + + # The first mark number used for git-fast-import commit marks. This + # value needs to be large to avoid conflicts with blob marks. + _first_commit_mark = 1000000000 + + def __init__( + self, dump_filename, revision_writer, + max_merges=None, author_transforms=None, + ): + """Constructor. + + DUMP_FILENAME is the name of the file to which the git-fast-import + commands for defining revisions should be written. (Please note + that depending on the style of revision writer, the actual file + contents might not be written to this file.) + + REVISION_WRITER is a GitRevisionWriter that is used to output + either the content of revisions or a mark that was previously used + to label a blob. + + MAX_MERGES can be set to an integer telling the maximum number of + parents that can be merged into a commit at once (aside from the + natural parent). If it is set to None, then there is no limit. + + AUTHOR_TRANSFORMS is a map {cvsauthor : (fullname, email)} from + CVS author names to git full name and email address. All of the + contents should either be Unicode strings or 8-bit strings encoded + as UTF-8. 
+ + """ + + self.dump_filename = dump_filename + self.revision_writer = revision_writer + self.max_merges = max_merges + + def to_utf8(s): + if isinstance(s, unicode): + return s.encode('utf8') + else: + return s + + self.author_transforms = {} + if author_transforms is not None: + for (cvsauthor, (name, email,)) in author_transforms.iteritems(): + cvsauthor = to_utf8(cvsauthor) + name = to_utf8(name) + email = to_utf8(email) + self.author_transforms[cvsauthor] = (name, email,) + + self._mirror = RepositoryMirror() + + self._mark_generator = KeyGenerator(GitOutputOption._first_commit_mark) + + def register_artifacts(self, which_pass): + # These artifacts are needed for SymbolingsReader: + artifact_manager.register_temp_file_needed( + config.SYMBOL_OPENINGS_CLOSINGS_SORTED, which_pass + ) + artifact_manager.register_temp_file_needed( + config.SYMBOL_OFFSETS_DB, which_pass + ) + self.revision_writer.register_artifacts(which_pass) + self._mirror.register_artifacts(which_pass) + + def check(self): + if Ctx().cross_project_commits: + raise FatalError( + 'Git output is not supported with cross-project commits' + ) + if Ctx().cross_branch_commits: + raise FatalError( + 'Git output is not supported with cross-branch commits' + ) + if Ctx().username is None: + raise FatalError( + 'Git output requires a default commit username' + ) + + def check_symbols(self, symbol_map): + # FIXME: What constraints does git impose on symbols? + pass + + def setup(self, svn_rev_count): + self._symbolings_reader = SymbolingsReader() + self.f = open(self.dump_filename, 'wb') + + # The youngest revnum that has been committed so far: + self._youngest = 0 + + # A map {lod : [(revnum, mark)]} giving each of the revision + # numbers in which there was a commit to lod, and the mark active + # at the end of the revnum. 
+ self._marks = {} + + self._mirror.open() + self.revision_writer.start(self.f, self._mirror) + + def _create_commit_mark(self, lod, revnum): + mark = self._mark_generator.gen_id() + self._set_lod_mark(lod, revnum, mark) + return mark + + def _set_lod_mark(self, lod, revnum, mark): + """Record MARK as the status of LOD for REVNUM. + + If there is already an entry for REVNUM, overwrite it. If not, + append a new entry to the self._marks list for LOD.""" + + assert revnum >= self._youngest + entry = (revnum, mark) + try: + modifications = self._marks[lod] + except KeyError: + # This LOD hasn't appeared before; create a new list and add the + # entry: + self._marks[lod] = [entry] + else: + # A record exists, so it necessarily has at least one element: + if modifications[-1][0] == revnum: + modifications[-1] = entry + else: + modifications.append(entry) + self._youngest = revnum + + def _get_author(self, svn_commit): + """Return the author to be used for SVN_COMMIT. + + Return the author in the form needed by git; that is, 'foo '.""" + + author = svn_commit.get_author() + (name, email,) = self.author_transforms.get(author, (author, author,)) + return '%s <%s>' % (name, email,) + + @staticmethod + def _get_log_msg(svn_commit): + return svn_commit.get_log_msg() + + def process_initial_project_commit(self, svn_commit): + self._mirror.start_commit(svn_commit.revnum) + self._mirror.end_commit() + + def process_primary_commit(self, svn_commit): + author = self._get_author(svn_commit) + log_msg = self._get_log_msg(svn_commit) + + lods = set() + for cvs_rev in svn_commit.get_cvs_items(): + lods.add(cvs_rev.lod) + if len(lods) != 1: + raise InternalError('Commit affects %d LODs' % (len(lods),)) + lod = lods.pop() + + self._mirror.start_commit(svn_commit.revnum) + if isinstance(lod, Trunk): + # FIXME: is this correct?: + self.f.write('commit refs/heads/master\n') + else: + self.f.write('commit refs/heads/%s\n' % (lod.name,)) + self.f.write( + 'mark :%d\n' + % 
(self._create_commit_mark(lod, svn_commit.revnum),) + ) + self.f.write( + 'committer %s %d +0000\n' % (author, svn_commit.date,) + ) + self.f.write('data %d\n' % (len(log_msg),)) + self.f.write('%s\n' % (log_msg,)) + for cvs_rev in svn_commit.get_cvs_items(): + self.revision_writer.process_revision(cvs_rev, post_commit=False) + + self.f.write('\n') + self._mirror.end_commit() + + def process_post_commit(self, svn_commit): + author = self._get_author(svn_commit) + log_msg = self._get_log_msg(svn_commit) + + source_lods = set() + for cvs_rev in svn_commit.cvs_revs: + source_lods.add(cvs_rev.lod) + if len(source_lods) != 1: + raise InternalError('Commit is from %d LODs' % (len(source_lods),)) + source_lod = source_lods.pop() + + self._mirror.start_commit(svn_commit.revnum) + # FIXME: is this correct?: + self.f.write('commit refs/heads/master\n') + self.f.write( + 'mark :%d\n' + % (self._create_commit_mark(None, svn_commit.revnum),) + ) + self.f.write( + 'committer %s %d +0000\n' % (author, svn_commit.date,) + ) + self.f.write('data %d\n' % (len(log_msg),)) + self.f.write('%s\n' % (log_msg,)) + self.f.write( + 'merge :%d\n' + % (self._get_source_mark(source_lod, svn_commit.revnum),) + ) + for cvs_rev in svn_commit.cvs_revs: + self.revision_writer.process_revision(cvs_rev, post_commit=True) + + self.f.write('\n') + self._mirror.end_commit() + + def _get_source_groups(self, svn_commit): + """Return groups of sources for SVN_COMMIT. + + SVN_COMMIT is an instance of SVNSymbolCommit. Yield tuples + (source_lod, svn_revnum, cvs_symbols) where source_lod is the line + of development and svn_revnum is the revision that should serve as + a source, and cvs_symbols is a list of CVSSymbolItems that can be + copied from that source. 
The groups are returned in arbitrary + order.""" + + # Get a map {CVSSymbol : SVNRevisionRange}: + range_map = self._symbolings_reader.get_range_map(svn_commit) + + # range_map, split up into one map per LOD; i.e., {LOD : + # {CVSSymbol : SVNRevisionRange}}: + lod_range_maps = {} + + for (cvs_symbol, range) in range_map.iteritems(): + lod_range_map = lod_range_maps.get(range.source_lod) + if lod_range_map is None: + lod_range_map = {} + lod_range_maps[range.source_lod] = lod_range_map + lod_range_map[cvs_symbol] = range + + # Sort the sources so that the branch that serves most often as + # parent is processed first: + lod_ranges = lod_range_maps.items() + lod_ranges.sort( + lambda (lod1,lod_range_map1),(lod2,lod_range_map2): + -cmp(len(lod_range_map1), len(lod_range_map2)) or cmp(lod1, lod2) + ) + + for (lod, lod_range_map) in lod_ranges: + while lod_range_map: + revision_scores = RevisionScores(lod_range_map.values()) + (source_lod, revnum, score) = revision_scores.get_best_revnum() + assert source_lod == lod + cvs_symbols = [] + for (cvs_symbol, range) in lod_range_map.items(): + if revnum in range: + cvs_symbols.append(cvs_symbol) + del lod_range_map[cvs_symbol] + yield (lod, revnum, cvs_symbols) + + def _get_all_files(self, node): + """Generate all of the CVSFiles under NODE.""" + + for cvs_path in node: + subnode = node[cvs_path] + if subnode is None: + yield cvs_path + else: + for sub_cvs_path in self._get_all_files(subnode): + yield sub_cvs_path + + def _is_simple_copy(self, svn_commit, source_groups): + """Return True iff SVN_COMMIT can be created as a simple copy. + + SVN_COMMIT is an SVNTagCommit. 
Return True iff it can be created + as a simple copy from an existing revision (i.e., if the fixup + branch can be avoided for this tag creation).""" + + # The first requirement is that there be exactly one source: + if len(source_groups) != 1: + return False + + (source_lod, svn_revnum, cvs_symbols) = source_groups[0] + + # The second requirement is that the destination LOD not already + # exist: + try: + self._mirror.get_current_lod_directory(svn_commit.symbol) + except KeyError: + # The LOD doesn't already exist. This is good. + pass + else: + # The LOD already exists. It cannot be created by a copy. + return False + + # The third requirement is that the source LOD contains exactly + # the same files as we need to add to the symbol: + try: + source_node = self._mirror.get_old_lod_directory(source_lod, svn_revnum) + except KeyError: + raise InternalError('Source %r does not exist' % (source_lod,)) + return ( + set([cvs_symbol.cvs_file for cvs_symbol in cvs_symbols]) + == set(self._get_all_files(source_node)) + ) + + def _get_source_mark(self, source_lod, revnum): + """Return the mark active on SOURCE_LOD at the end of REVNUM.""" + + modifications = self._marks[source_lod] + i = bisect.bisect_left(modifications, (revnum + 1,)) - 1 + (revnum, mark) = modifications[i] + return mark + + def _process_symbol_commit( + self, svn_commit, git_branch, source_groups, mark + ): + author = self._get_author(svn_commit) + log_msg = self._get_log_msg(svn_commit) + + self.f.write('commit %s\n' % (git_branch,)) + self.f.write('mark :%d\n' % (mark,)) + self.f.write('committer %s %d +0000\n' % (author, svn_commit.date,)) + self.f.write('data %d\n' % (len(log_msg),)) + self.f.write('%s\n' % (log_msg,)) + + for (source_lod, source_revnum, cvs_symbols,) in source_groups: + self.f.write( + 'merge :%d\n' + % (self._get_source_mark(source_lod, source_revnum),) + ) + + for (source_lod, source_revnum, cvs_symbols,) in source_groups: + for cvs_symbol in cvs_symbols: + 
self.revision_writer.branch_file(cvs_symbol) + + self.f.write('\n') + + def process_branch_commit(self, svn_commit): + self._mirror.start_commit(svn_commit.revnum) + source_groups = list(self._get_source_groups(svn_commit)) + for groups in get_chunks(source_groups, self.max_merges): + self._process_symbol_commit( + svn_commit, 'refs/heads/%s' % (svn_commit.symbol.name,), + groups, + self._create_commit_mark(svn_commit.symbol, svn_commit.revnum), + ) + self._mirror.end_commit() + + def _set_symbol(self, symbol, mark): + if isinstance(symbol, Branch): + category = 'heads' + elif isinstance(symbol, Tag): + category = 'tags' + else: + raise InternalError() + self.f.write('reset refs/%s/%s\n' % (category, symbol.name,)) + self.f.write('from :%d\n' % (mark,)) + + def process_tag_commit(self, svn_commit): + # FIXME: For now we create a fixup branch with the same name as + # the tag, then the tag. We never delete the fixup branch. Also, + # a fixup branch is created even if the tag could be created from + # a single source. 
+ self._mirror.start_commit(svn_commit.revnum) + + source_groups = list(self._get_source_groups(svn_commit)) + if self._is_simple_copy(svn_commit, source_groups): + (source_lod, source_revnum, cvs_symbols) = source_groups[0] + Log().debug( + '%s will be created via a simple copy from %s:r%d' + % (svn_commit.symbol, source_lod, source_revnum,) + ) + mark = self._get_source_mark(source_lod, source_revnum) + self._set_symbol(svn_commit.symbol, mark) + else: + Log().debug( + '%s will be created via a fixup branch' % (svn_commit.symbol,) + ) + + # Create the fixup branch (which might involve making more than + # one commit): + for groups in get_chunks(source_groups, self.max_merges): + mark = self._create_commit_mark(svn_commit.symbol, svn_commit.revnum) + self._process_symbol_commit( + svn_commit, FIXUP_BRANCH_NAME, groups, mark + ) + + # Store the mark of the last commit to the fixup branch as the + # value of the tag: + self._set_symbol(svn_commit.symbol, mark) + self.f.write('reset %s\n' % (FIXUP_BRANCH_NAME,)) + self.f.write('\n') + + self._mirror.end_commit() + + def cleanup(self): + self.revision_writer.finish() + self._mirror.close() + self.f.close() + del self.f + self._symbolings_reader.close() + del self._symbolings_reader + + diff --git a/cvs2svn_lib/git_revision_recorder.py b/cvs2svn_lib/git_revision_recorder.py new file mode 100644 index 0000000..604f8ac --- /dev/null +++ b/cvs2svn_lib/git_revision_recorder.py @@ -0,0 +1,114 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2007-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. 
+# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""Write file contents to a stream of git-fast-import blobs.""" + +import itertools + +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.cvs_item import CVSRevisionDelete +from cvs2svn_lib.cvs_item import CVSSymbol +from cvs2svn_lib.fulltext_revision_recorder import FulltextRevisionRecorder +from cvs2svn_lib.key_generator import KeyGenerator + + +class GitRevisionRecorder(FulltextRevisionRecorder): + """Output file revisions to git-fast-import.""" + + def __init__(self, blob_filename): + self.blob_filename = blob_filename + + def start(self): + self.dump_file = open(self.blob_filename, 'wb') + self._mark_generator = KeyGenerator() + + def start_file(self, cvs_file_items): + self._cvs_file_items = cvs_file_items + + def _get_original_source(self, cvs_rev): + """Return the original source of the contents of CVS_REV. + + Return the first non-delete CVSRevision with the same contents as + CVS_REV. 'First' here refers to deltatext order; i.e., the very + first revision is HEAD on trunk, then backwards to the root of a + branch, then out to the tip of a branch. + + The candidates are all revisions along the CVS delta-dependency + chain until the next one that has a deltatext (inclusive). Of the + candidates, CVSRevisionDeletes are disqualified because, even + though CVS records their contents, it is impossible to extract + their fulltext using commands like 'cvs checkout -p'. 
+ + If there is no other CVSRevision that has the same content, return + CVS_REV itself.""" + + # Keep track of the "best" source CVSRevision found so far: + best_source_rev = None + + for cvs_rev in itertools.chain( + [cvs_rev], self._cvs_file_items.iter_deltatext_ancestors(cvs_rev) + ): + if not isinstance(cvs_rev, CVSRevisionDelete): + best_source_rev = cvs_rev + + if cvs_rev.deltatext_exists: + break + + return best_source_rev + + def record_fulltext(self, cvs_rev, log, fulltext): + """Write the fulltext to a blob if it is original and not a delete. + + The reason we go to this trouble is to avoid writing the same file + contents multiple times for a string of revisions that don't have + deltatexts (as, for example, happens with dead revisions and + imported revisions).""" + + if isinstance(cvs_rev, CVSRevisionDelete): + # There is no need to record a delete revision, and its token + # will never be needed: + return None + + source = self._get_original_source(cvs_rev) + + if source.id == cvs_rev.id: + # Revision is its own source; write it out: + mark = self._mark_generator.gen_id() + self.dump_file.write('blob\n') + self.dump_file.write('mark :%d\n' % (mark,)) + self.dump_file.write('data %d\n' % (len(fulltext),)) + self.dump_file.write(fulltext) + self.dump_file.write('\n') + return mark + else: + # Return as revision_recorder_token the CVSRevision.id of the + # original source revision: + return source.revision_recorder_token + + def finish_file(self, cvs_file_items): + # Determine the original source of each CVSSymbol, and store it as + # the symbol's revision_recorder_token. 
+ for cvs_item in cvs_file_items.values(): + if isinstance(cvs_item, CVSSymbol): + cvs_source = cvs_item.get_cvs_revision_source(cvs_file_items) + cvs_item.revision_recorder_token = cvs_source.revision_recorder_token + + del self._cvs_file_items + + def finish(self): + self.dump_file.close() + + diff --git a/cvs2svn_lib/git_run_options.py b/cvs2svn_lib/git_run_options.py new file mode 100644 index 0000000..726b127 --- /dev/null +++ b/cvs2svn_lib/git_run_options.py @@ -0,0 +1,274 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
# ====================================================================

"""This module manages cvs2git run options."""


import sys
import datetime
import codecs

from cvs2svn_lib.version import VERSION
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.log import Log
from cvs2svn_lib.context import Ctx
from cvs2svn_lib.run_options import not_both
from cvs2svn_lib.run_options import RunOptions
from cvs2svn_lib.run_options import ContextOption
from cvs2svn_lib.run_options import IncompatibleOption
from cvs2svn_lib.run_options import authors
from cvs2svn_lib.man_writer import ManWriter
from cvs2svn_lib.project import Project
from cvs2svn_lib.rcs_revision_manager import RCSRevisionReader
from cvs2svn_lib.cvs_revision_manager import CVSRevisionReader
from cvs2svn_lib.git_revision_recorder import GitRevisionRecorder
from cvs2svn_lib.git_output_option import GitRevisionMarkWriter
from cvs2svn_lib.git_output_option import GitOutputOption
from cvs2svn_lib.revision_manager import NullRevisionRecorder
from cvs2svn_lib.revision_manager import NullRevisionExcluder
from cvs2svn_lib.fulltext_revision_recorder \
    import SimpleFulltextRevisionRecorderAdapter


# The following constants hold the text used to build the cvs2git man
# page (see ManWriter); the strings contain embedded groff markup.

short_desc = 'convert a cvs repository into a git repository'

synopsis = """\
.B cvs2git
[\\fIOPTION\\fR]... \\fIOUTPUT-OPTIONS CVS-REPOS-PATH\\fR
.br
.B cvs2git
[\\fIOPTION\\fR]... \\fI--options=PATH\\fR
"""

long_desc = """\
Create a new git repository based on the version history stored in a
CVS repository. Each CVS commit will be mirrored in the git
repository, including such information as date of commit and id of the
committer.
.P
The output of this program are a "blobfile" and a "dumpfile", which
together can be loaded into a git repository using "git fast-import".
.P
\\fICVS-REPOS-PATH\\fR is the filesystem path of the part of the CVS
repository that you want to convert. This path doesn't have to be the
top level directory of a CVS repository; it can point at a project
within a repository, in which case only that project will be
converted. This path or one of its parent directories has to contain
a subdirectory called CVSROOT (though the CVSROOT directory can be
empty).
.P
It is not possible directly to convert a CVS repository to which you
only have remote access, but the FAQ describes tools that may be used
to create a local copy of a remote CVS repository.
"""

files = """\
A directory called \\fIcvs2svn-tmp\\fR (or the directory specified by
\\fB--tmpdir\\fR) is used as scratch space for temporary data files.
"""

# (name, man-page-section) pairs for the SEE ALSO section:
see_also = [
  ('cvs', '1'),
  ('git', '1'),
  ('git-fast-import', '1'),
  ]


class GitRunOptions(RunOptions):
  """Process the command-line options for cvs2git."""

  def __init__(self, progname, cmd_args, pass_manager):
    # In cvs2git, commits may never span projects or branches:
    Ctx().cross_project_commits = False
    Ctx().cross_branch_commits = False
    RunOptions.__init__(self, progname, cmd_args, pass_manager)

  def _get_output_options_group(self):
    """Return the base output OptionGroup with cvs2git's options added."""

    group = RunOptions._get_output_options_group(self)

    group.add_option(IncompatibleOption(
        '--blobfile', type='string',
        action='store',
        help='path to which the "blob" data should be written',
        man_help=(
            'Write the "blob" data (containing revision contents) to '
            '\\fIpath\\fR.'
            ),
        metavar='PATH',
        ))
    group.add_option(IncompatibleOption(
        '--dumpfile', type='string',
        action='store',
        help='path to which the revision data should be written',
        man_help=(
            'Write the revision data (branches and commits) to \\fIpath\\fR.'
            ),
        metavar='PATH',
        ))
    group.add_option(ContextOption(
        '--dry-run',
        action='store_true',
        help=(
            'do not create any output; just print what would happen.'
            ),
        man_help=(
            'Do not create any output; just print what would happen.'
+ ), + )) + + return group + + def _get_extraction_options_group(self): + group = RunOptions._get_extraction_options_group(self) + + self.parser.set_default('use_cvs', False) + group.add_option(IncompatibleOption( + '--use-cvs', + action='store_true', + help=( + 'use CVS to extract revision contents (slower than ' + '--use-rcs but more reliable) (default)' + ), + man_help=( + 'Use CVS to extract revision contents. This option is slower ' + 'than \\fB--use-rcs\\fR but more reliable.' + ), + )) + self.parser.set_default('use_rcs', False) + group.add_option(IncompatibleOption( + '--use-rcs', + action='store_true', + help=( + 'use RCS to extract revision contents (faster than ' + '--use-cvs but fails in some cases)' + ), + man_help=( + 'Use RCS \'co\' to extract revision contents. This option is ' + 'faster than \\fB--use-cvs\\fR but fails in some cases.' + ), + )) + + return group + + def callback_manpage(self, option, opt_str, value, parser): + f = codecs.getwriter('utf_8')(sys.stdout) + ManWriter( + parser, + section='1', + date=datetime.date.today(), + source='Version %s' % (VERSION,), + manual='User Commands', + short_desc=short_desc, + synopsis=synopsis, + long_desc=long_desc, + files=files, + authors=authors, + see_also=see_also, + ).write_manpage(f) + sys.exit(0) + + def process_io_options(self): + """Process input/output options. 
+ + Process options related to extracting data from the CVS repository + and writing to 'git fast-import'-formatted files.""" + + ctx = Ctx() + options = self.options + + not_both(options.use_rcs, '--use-rcs', + options.use_cvs, '--use-cvs') + + if options.use_rcs: + revision_reader = RCSRevisionReader( + co_executable=options.co_executable + ) + else: + # --use-cvs is the default: + revision_reader = CVSRevisionReader( + cvs_executable=options.cvs_executable + ) + + if ctx.dry_run: + ctx.revision_recorder = NullRevisionRecorder() + else: + if not (options.blobfile and options.dumpfile): + raise FatalError("must pass '--blobfile' and '--dumpfile' options.") + ctx.revision_recorder = SimpleFulltextRevisionRecorderAdapter( + revision_reader, + GitRevisionRecorder(options.blobfile), + ) + + ctx.revision_excluder = NullRevisionExcluder() + ctx.revision_reader = None + + ctx.output_option = GitOutputOption( + options.dumpfile, + GitRevisionMarkWriter(), + max_merges=None, + # Optional map from CVS author names to git author names: + author_transforms={}, # FIXME + ) + + def set_project( + self, + project_cvs_repos_path, + symbol_transforms=None, + symbol_strategy_rules=[], + ): + """Set the project to be converted. + + If a project had already been set, overwrite it. + + Most arguments are passed straight through to the Project + constructor. SYMBOL_STRATEGY_RULES is an iterable of + SymbolStrategyRules that will be applied to symbols in this + project.""" + + symbol_strategy_rules = list(symbol_strategy_rules) + + project = Project( + 0, + project_cvs_repos_path, + symbol_transforms=symbol_transforms, + ) + + self.projects = [project] + self.project_symbol_strategy_rules = [symbol_strategy_rules] + + def process_options(self): + # Consistency check for options and arguments. 
+ if len(self.args) == 0: + self.usage() + sys.exit(1) + + if len(self.args) > 1: + Log().error(error_prefix + ": must pass only one CVS repository.\n") + self.usage() + sys.exit(1) + + cvsroot = self.args[0] + + self.process_io_options() + self.process_symbol_strategy_options() + self.process_property_setter_options() + + # Create the project: + self.set_project( + cvsroot, + symbol_transforms=self.options.symbol_transforms, + symbol_strategy_rules=self.options.symbol_strategy_rules, + ) + + diff --git a/cvs2svn_lib/key_generator.py b/cvs2svn_lib/key_generator.py new file mode 100644 index 0000000..d580d6b --- /dev/null +++ b/cvs2svn_lib/key_generator.py @@ -0,0 +1,45 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains the KeyGenerator class.""" + + +class KeyGenerator: + """Generate a series of unique keys.""" + + def __init__(self, first_id=1): + """Initialize a KeyGenerator with the specified FIRST_ID. 
+ + FIRST_ID should be an int or long, and the generated keys will be + of the same type.""" + + self._key_base = first_id + self._last_id = None + + def gen_id(self): + """Generate and return a previously-unused key, as an integer.""" + + self._last_id = self._key_base + self._key_base += 1 + + return self._last_id + + def get_last_id(self): + """Return the last id that was generated, as an integer.""" + + return self._last_id + + diff --git a/cvs2svn_lib/log.py b/cvs2svn_lib/log.py new file mode 100644 index 0000000..798350c --- /dev/null +++ b/cvs2svn_lib/log.py @@ -0,0 +1,174 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains a simple logging facility for cvs2svn.""" + + +import sys +import time +import threading + + +class Log: + """A Simple logging facility. + + If self.log_level is DEBUG or higher, each line will be timestamped + with the number of wall-clock seconds since the time when this + module was first imported. + + If self.use_timestamps is True, each line will be timestamped with a + human-readable clock time. + + The public methods of this class are thread-safe. 
+ + This class is a Borg; see + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.""" + + # These constants represent the log levels that this class supports. + # The increase_verbosity() and decrease_verbosity() methods rely on + # these constants being consecutive integers: + ERROR = -2 + WARN = -1 + QUIET = 0 + NORMAL = 1 + VERBOSE = 2 + DEBUG = 3 + + start_time = time.time() + + __shared_state = {} + + def __init__(self): + self.__dict__ = self.__shared_state + if self.__dict__: + return + + self.log_level = Log.NORMAL + + # Set this to True if you want to see timestamps on each line output. + self.use_timestamps = False + + # The output file to use for errors: + self._err = sys.stderr + + # The output file to use for lower-priority messages: + self._out = sys.stdout + + # Lock to serialize writes to the log: + self.lock = threading.Lock() + + def increase_verbosity(self): + self.lock.acquire() + try: + self.log_level = min(self.log_level + 1, Log.DEBUG) + finally: + self.lock.release() + + def decrease_verbosity(self): + self.lock.acquire() + try: + self.log_level = max(self.log_level - 1, Log.ERROR) + finally: + self.lock.release() + + def is_on(self, level): + """Return True iff messages at the specified LEVEL are currently on. + + LEVEL should be one of the constants Log.WARN, Log.QUIET, etc.""" + + return self.log_level >= level + + def _timestamp(self): + """Return a timestamp if needed, as a string with a trailing space.""" + + retval = [] + + if self.log_level >= Log.DEBUG: + retval.append('%f: ' % (time.time() - self.start_time,)) + + if self.use_timestamps: + retval.append(time.strftime('[%Y-%m-%d %I:%M:%S %Z] - ')) + + return ''.join(retval) + + def _write(self, out, *args): + """Write a message to OUT. + + If there are multiple ARGS, they will be separated by spaces. 
If + there are multiple lines, they will be output one by one with the + same timestamp prefix.""" + + timestamp = self._timestamp() + s = ' '.join(map(str, args)) + lines = s.split('\n') + if lines and not lines[-1]: + del lines[-1] + + self.lock.acquire() + try: + for s in lines: + out.write('%s%s\n' % (timestamp, s,)) + # Ensure that log output doesn't get out-of-order with respect to + # stderr output. + out.flush() + finally: + self.lock.release() + + def write(self, *args): + """Write a message to SELF._out. + + This is a public method to use for writing to the output log + unconditionally.""" + + self._write(self._out, *args) + + def error(self, *args): + """Log a message at the ERROR level.""" + + if self.is_on(Log.ERROR): + self._write(self._err, *args) + + def warn(self, *args): + """Log a message at the WARN level.""" + + if self.is_on(Log.WARN): + self._write(self._out, *args) + + def quiet(self, *args): + """Log a message at the QUIET level.""" + + if self.is_on(Log.QUIET): + self._write(self._out, *args) + + def normal(self, *args): + """Log a message at the NORMAL level.""" + + if self.is_on(Log.NORMAL): + self._write(self._out, *args) + + def verbose(self, *args): + """Log a message at the VERBOSE level.""" + + if self.is_on(Log.VERBOSE): + self._write(self._out, *args) + + def debug(self, *args): + """Log a message at the DEBUG level.""" + + if self.is_on(Log.DEBUG): + self._write(self._out, *args) + + diff --git a/cvs2svn_lib/main.py b/cvs2svn_lib/main.py new file mode 100644 index 0000000..492c49e --- /dev/null +++ b/cvs2svn_lib/main.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python2 +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. 
+# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +import os +import errno +import gc + +try: + # Try to get access to a bunch of encodings for use with --encoding. + # See http://cjkpython.i18n.org/ for details. + import iconv_codec +except ImportError: + pass + +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.svn_run_options import SVNRunOptions +from cvs2svn_lib.git_run_options import GitRunOptions +from cvs2svn_lib.bzr_run_options import BzrRunOptions +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.pass_manager import PassManager +from cvs2svn_lib.passes import passes + + +def main(progname, run_options, pass_manager): + # Disable garbage collection, as we try not to create any circular + # data structures: + gc.disable() + + # Convenience var, so we don't have to keep instantiating this Borg. + ctx = Ctx() + + # Make sure the tmp directory exists. Note that we don't check if + # it's empty -- we want to be able to use, for example, "." to hold + # tempfiles. But if we *did* want check if it were empty, we'd do + # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-). + if not os.path.exists(ctx.tmpdir): + erase_tmpdir = True + os.mkdir(ctx.tmpdir) + elif not os.path.isdir(ctx.tmpdir): + raise FatalError( + "cvs2svn tried to use '%s' for temporary files, but that path\n" + " exists and is not a directory. Please make it be a directory,\n" + " or specify some other directory for temporary files." + % (ctx.tmpdir,)) + else: + erase_tmpdir = False + + # But do lock the tmpdir, to avoid process clash. 
+ try: + os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock')) + except OSError, e: + if e.errno == errno.EACCES: + raise FatalError("Permission denied:" + + " No write access to directory '%s'." % ctx.tmpdir) + if e.errno == errno.EEXIST: + raise FatalError( + "cvs2svn is using directory '%s' for temporary files, but\n" + " subdirectory '%s/cvs2svn.lock' exists, indicating that another\n" + " cvs2svn process is currently using '%s' as its temporary\n" + " workspace. If you are certain that is not the case,\n" + " then remove the '%s/cvs2svn.lock' subdirectory." + % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,)) + raise + + try: + if run_options.profiling: + import hotshot + prof = hotshot.Profile('cvs2svn.hotshot') + prof.runcall(pass_manager.run, run_options) + prof.close() + else: + pass_manager.run(run_options) + finally: + try: + os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock')) + except: + pass + + if erase_tmpdir: + try: + os.rmdir(ctx.tmpdir) + except: + pass + + +def svn_main(progname, cmd_args): + pass_manager = PassManager(passes) + run_options = SVNRunOptions(progname, cmd_args, pass_manager) + main(progname, run_options, pass_manager) + + +def git_main(progname, cmd_args): + pass_manager = PassManager(passes) + run_options = GitRunOptions(progname, cmd_args, pass_manager) + main(progname, run_options, pass_manager) + + +def bzr_main(progname, cmd_args): + pass_manager = PassManager(passes) + run_options = BzrRunOptions(progname, cmd_args, pass_manager) + main(progname, run_options, pass_manager) + + diff --git a/cvs2svn_lib/man_writer.py b/cvs2svn_lib/man_writer.py new file mode 100644 index 0000000..3cca8c9 --- /dev/null +++ b/cvs2svn_lib/man_writer.py @@ -0,0 +1,197 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2009 CollabNet. All rights reserved. 
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains the ManWriter class for outputting manpages."""


import datetime
import optparse
import re


# Matches any run of whitespace (used to normalize text before wrapping):
whitespace_re = re.compile(r'\s+')


def wrap(s, width=70):
  """Wrap S into lines of at most WIDTH characters, for roff output.

  All whitespace runs in S are first collapsed to single spaces; words
  are never split (a word longer than WIDTH is emitted unbroken).
  Output lines that would begin with a roff control character ("'" or
  ".") are escaped with a backslash so roff renders them literally."""

  # Convert all whitespace substrings to single spaces:
  s = whitespace_re.sub(' ', s)
  s = s.strip()
  retval = []
  while s:
    if len(s) <= width:
      retval.append(s)
      break
    i = s.rfind(' ', 0, width + 1)
    if i == -1:
      # There were no spaces within the first width+1 characters; break
      # at the next space after width:
      i = s.find(' ', width + 1)
      if i == -1:
        # There were no spaces in s at all.
        retval.append(s)
        break

    retval.append(s[:i].rstrip())
    s = s[i+1:].lstrip()

  for (i, line) in enumerate(retval):
    if line.startswith('\'') or line.startswith('.'):
      # These are roff control characters and have to be escaped:
      retval[i] = '\\' + line

  return '\n'.join(retval)


class ManOption(optparse.Option):
  """An optparse.Option that holds an explicit string for the man page.

  The required MAN_HELP keyword argument is stored on the option and is
  used by ManWriter in preference to the option's normal help text."""

  def __init__(self, *args, **kw):
    self.man_help = kw.pop('man_help')
    optparse.Option.__init__(self, *args, **kw)


class ManWriter(object):
  """Write a Unix manpage (roff source) describing an optparse parser."""

  def __init__(
        self,
        parser,
        section, date, source, manual,
        short_desc, synopsis, long_desc, files, authors, see_also,
        ):
    # PARSER is the optparse.OptionParser whose options are to be
    # documented.  SECTION is the manpage section (e.g., '1'); DATE is
    # a datetime.date used in the '.TH' header.  The remaining
    # arguments are preformatted roff strings that are emitted
    # verbatim, except SEE_ALSO, which is a list of (name, section)
    # pairs:
    self.parser = parser
    self.section = section
    self.date = date
    self.source = source
    self.manual = manual
    self.short_desc = short_desc
    self.synopsis = synopsis
    self.long_desc = long_desc
    self.files = files
    self.authors = authors
    self.see_also = see_also

  def write_title(self, f):
    """Write the groff processing hint and the '.TH' header to F."""

    f.write('.\\" Process this file with\n')
    f.write(
        '.\\" groff -man -Tascii %s.%s\n' % (
            self.parser.get_prog_name(),
            self.section,
            )
        )
    f.write(
        '.TH %s "%s" "%s" "%s" "%s"\n' % (
            self.parser.get_prog_name().upper(),
            self.section,
            self.date.strftime('%b %d, %Y'),
            self.source,
            self.manual,
            )
        )

  def write_name(self, f):
    """Write the NAME section to F."""

    f.write('.SH "NAME"\n')
    f.write(
        '%s \- %s\n' % (
            self.parser.get_prog_name(),
            self.short_desc,
            )
        )

  def write_synopsis(self, f):
    """Write the SYNOPSIS section to F."""

    f.write('.SH "SYNOPSIS"\n')
    f.write(self.synopsis)

  def write_description(self, f):
    """Write the DESCRIPTION section to F."""

    f.write('.SH "DESCRIPTION"\n')
    f.write(self.long_desc)

  def _get_option_strings(self, option):
    """Return a list of option strings formatted with their metavariables.

    This method is very similar to
    optparse.HelpFormatter.format_option_strings().

    """

    if option.takes_value():
      metavar = (option.metavar or option.dest).lower()
      short_opts = [
          '\\fB%s\\fR \\fI%s\\fR' % (opt, metavar)
          for opt in option._short_opts
          ]
      long_opts = [
          '\\fB%s\\fR=\\fI%s\\fR' % (opt, metavar)
          for opt in option._long_opts
          ]
    else:
      short_opts = [
          '\\fB%s\\fR' % (opt,)
          for opt in option._short_opts
          ]
      long_opts = [
          '\\fB%s\\fR' % (opt,)
          for opt in option._long_opts
          ]

    return short_opts + long_opts

  def _write_option(self, f, option):
    """Write a single '.IP' entry for OPTION to F."""

    # Prefer the explicit man_help text (see ManOption); fall back to
    # the option's normal help string:
    man_help = getattr(option, 'man_help', option.help)

    if man_help is not optparse.SUPPRESS_HELP:
      man_help = wrap(man_help)
      f.write('.IP "%s"\n' % (', '.join(self._get_option_strings(option)),))
      f.write('%s\n' % (man_help,))

  def _write_container_help(self, f, container):
    """Write entries for all non-suppressed options in CONTAINER to F."""

    for option in container.option_list:
      if option.help is not optparse.SUPPRESS_HELP:
        self._write_option(f, option)

  def write_options(self, f):
    """Write the OPTIONS section (plus one section per group) to F."""

    f.write('.SH "OPTIONS"\n')
    if self.parser.option_list:
      self._write_container_help(f, self.parser)
    for group in self.parser.option_groups:
      f.write('.SH "%s"\n' % (group.title.upper(),))
      if group.description:
        # BUGFIX: this used to call self.format_description(), which
        # does not exist on ManWriter and raised AttributeError for any
        # option group that had a description.  Format the description
        # with wrap(), consistent with how option help texts are
        # formatted:
        f.write(wrap(group.description) + '\n')
      self._write_container_help(f, group)

  def write_files(self, f):
    """Write the FILES section to F."""

    f.write('.SH "FILES"\n')
    f.write(self.files)

  def write_authors(self, f):
    """Write the AUTHORS section to F."""

    f.write('.SH "AUTHORS"\n')
    f.write(self.authors)

  def write_see_also(self, f):
    """Write the SEE ALSO section to F."""

    f.write('.SH "SEE ALSO"\n')
    f.write(', '.join([
        '%s(%s)' % (name, section,)
        for (name, section,) in self.see_also
        ]) + '\n')

  def write_manpage(self, f):
    """Write the complete manpage to the open file object F."""

    self.write_title(f)
    self.write_name(f)
    self.write_synopsis(f)
    self.write_description(f)
    self.write_options(f)
    self.write_files(f)
    self.write_authors(f)
    self.write_see_also(f)
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2008 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""Represent CVSRevision metadata."""


class Metadata(object):
  """A simple value object holding the metadata for CVSRevisions.

  Instances are stored in the metadata database keyed by their unique
  numeric ID."""

  def __init__(self, id, author, log_msg):
    # The unique numeric key under which this record is stored:
    self.id = id
    # The author of the revision(s):
    self.author = author
    # The log message of the revision(s):
    self.log_msg = log_msg
# ====================================================================

"""This module contains classes to manage CVSRevision metadata."""


try:
  # Python >= 2.5:
  from hashlib import sha1
except ImportError:
  # Fallback for older Pythons:
  from sha import new as sha1

from cvs2svn_lib.context import Ctx
from cvs2svn_lib.database import IndexedDatabase
from cvs2svn_lib.key_generator import KeyGenerator
from cvs2svn_lib.serializer import PrimedPickleSerializer
from cvs2svn_lib.metadata import Metadata


def MetadataDatabase(store_filename, index_table_filename, mode):
  """A database to store Metadata instances that describe CVSRevisions.

  This database manages a map

      id -> Metadata instance

  where id is a unique identifier for the metadata."""

  serializer = PrimedPickleSerializer((Metadata,))
  return IndexedDatabase(
      store_filename, index_table_filename, mode, serializer,
      )


class MetadataLogger:
  """Store and generate IDs for the metadata associated with CVSRevisions.

  CVSRevisions that might be combinable must share a metadata ID, so we
  need a one-to-one relationship id <-> metadata.  A literal map
  {metadata : id} would grow too large, so instead we digest the
  significant parts of the metadata and keep a map {digest : id}.

  To get the ID for a new set of metadata, first create the digest.  If
  an ID is already registered for that digest, return it; otherwise
  generate a new ID, store the metadata under that ID, record the
  mapping {digest : id}, and return the new ID.

  The digest covers the author and log_msg, plus the project_id if
  Ctx().cross_project_commits is not set and the branch_name if
  Ctx().cross_branch_commits is not set."""

  def __init__(self, metadata_db):
    self._metadata_db = metadata_db

    # A map { digest : id }:
    self._digest_to_id = {}

    # Generates ids for metadata combinations not seen before:
    self.key_generator = KeyGenerator()

  def store(self, project, branch_name, author, log_msg):
    """Store the metadata and return its id.

    Locate the record for a commit with the specified (PROJECT,
    BRANCH_NAME, AUTHOR, LOG_MSG) and return its id.  (Depending on
    policy, not all of these items are necessarily used when creating
    the unique id.)  If there is no such record, create one and return
    its newly-generated id."""

    key = [author, log_msg]
    if not Ctx().cross_project_commits:
      key.append('%x' % project.id)
    if not Ctx().cross_branch_commits:
      key.append(branch_name or '')

    digest = sha1('\0'.join(key)).digest()
    id = self._digest_to_id.get(digest)
    if id is None:
      # Not seen before; assign a fresh id and record the metadata:
      id = self.key_generator.gen_id()
      self._digest_to_id[digest] = id
      self._metadata_db[id] = Metadata(id, author, log_msg)
    return id
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains classes to keep track of symbol openings/closings."""


import cPickle

from cvs2svn_lib import config
from cvs2svn_lib.common import InternalError
from cvs2svn_lib.artifact_manager import artifact_manager
from cvs2svn_lib.svn_revision_range import SVNRevisionRange


# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'


class SymbolingsLogger:
  """Manage the file that contains lines for symbol openings and closings.

  This data will later be used to determine valid SVNRevision ranges
  from which a file can be copied when creating a branch or tag in
  Subversion.  Do this by finding 'Openings' and 'Closings' for each
  file copied onto a branch or tag.

  An 'Opening' is the beginning of the lifetime of the source
  (CVSRevision or CVSBranch) from which a given CVSSymbol sprouts.

  The 'Closing' is the SVN revision when the source is deleted or
  overwritten.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
  obviously sprouts from revision 1.2.  Therefore, the SVN revision
  when 1.2 is committed is the opening for BEE on path 'foo.c', and
  the SVN revision when 1.3 is committed is the closing for BEE on
  path 'foo.c'.  Note that there may be many revisions chronologically
  between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
  perhaps even including on branch BEE itself.  But 1.3 is the next
  revision *on the same line* as 1.2, that is why it is the closing
  revision for those symbolic names of which 1.2 is the opening.

  The reason for doing all this hullabaloo is (1) to determine what
  range of SVN revision numbers can be used as the source of a copy of
  a particular file onto a branch/tag, and (2) to minimize the number
  of copies and deletes per creation by choosing source SVN revision
  numbers that can be used for as many files as possible.

  For example, revisions 1.2 and 1.3 of foo.c might correspond to
  revisions 17 and 30 in Subversion.  That means that when creating
  branch BEE, foo.c has to be copied from a Subversion revision number
  in the range 17 <= revnum < 30.  Now if there were another file,
  'bar.c', in the same directory, and 'bar.c's opening and closing for
  BEE correspond to revisions 24 and 39 in Subversion, then we can
  kill two birds with one stone by copying the whole directory from
  somewhere in the range 24 <= revnum < 30."""

  def __init__(self):
    self.symbolings = open(
        artifact_manager.get_temp_file(config.SYMBOL_OPENINGS_CLOSINGS), 'w')

  def log_revision(self, cvs_rev, svn_revnum):
    """Log any openings and closings found in CVS_REV."""

    for (symbol_id, cvs_symbol_id,) in cvs_rev.opened_symbols:
      self._log_opening(symbol_id, cvs_symbol_id, svn_revnum)

    for (symbol_id, cvs_symbol_id) in cvs_rev.closed_symbols:
      self._log_closing(symbol_id, cvs_symbol_id, svn_revnum)

  def log_branch_revision(self, cvs_branch, svn_revnum):
    """Log any openings and closings found in CVS_BRANCH."""

    for (symbol_id, cvs_symbol_id,) in cvs_branch.opened_symbols:
      self._log_opening(symbol_id, cvs_symbol_id, svn_revnum)

  def _log(self, symbol_id, cvs_symbol_id, svn_revnum, event_type):
    """Log an opening or closing to self.symbolings.

    Write out a single line to the symbol_openings_closings file
    representing that SVN_REVNUM is either the opening or closing
    (EVENT_TYPE) of CVS_SYMBOL_ID for SYMBOL_ID.

    EVENT_TYPE should be one of the following constants: OPENING or
    CLOSING."""

    # Note: renamed from 'type' to avoid shadowing the builtin.
    self.symbolings.write(
        '%x %d %s %x\n' % (symbol_id, svn_revnum, event_type, cvs_symbol_id)
        )

  def _log_opening(self, symbol_id, cvs_symbol_id, svn_revnum):
    """Log an opening to self.symbolings.

    See _log() for more information."""

    self._log(symbol_id, cvs_symbol_id, svn_revnum, OPENING)

  def _log_closing(self, symbol_id, cvs_symbol_id, svn_revnum):
    """Log a closing to self.symbolings.

    See _log() for more information."""

    self._log(symbol_id, cvs_symbol_id, svn_revnum, CLOSING)

  def close(self):
    self.symbolings.close()
    self.symbolings = None


class SymbolingsReader:
  """Provides an interface to retrieve symbol openings and closings.

  This class accesses the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  Does the heavy lifting of finding and returning
  the correct opening and closing Subversion revision numbers for a
  given symbolic name and SVN revision number range."""

  def __init__(self):
    """Opens the SYMBOL_OPENINGS_CLOSINGS_SORTED for reading, and
    reads the offsets database into memory."""

    self.symbolings = open(
        artifact_manager.get_temp_file(
            config.SYMBOL_OPENINGS_CLOSINGS_SORTED),
        'r')
    # The offsets_db is really small, and we need to read and write
    # from it a fair bit, so suck it into memory.  (Use open() rather
    # than the deprecated file() constructor, for consistency with the
    # rest of this module.)
    offsets_db = open(
        artifact_manager.get_temp_file(config.SYMBOL_OFFSETS_DB), 'rb')
    # A map from symbol_id to offset.  The values of this map are
    # incremented as the openings and closings for a symbol are
    # consumed.
    self.offsets = cPickle.load(offsets_db)
    offsets_db.close()

  def close(self):
    self.symbolings.close()
    del self.symbolings
    del self.offsets

  def _generate_lines(self, symbol):
    """Generate the lines for SYMBOL.

    SYMBOL is a TypedSymbol instance.  Yield the tuple (revnum, type,
    cvs_symbol_id) for all openings and closings for SYMBOL."""

    if symbol.id in self.offsets:
      # Set our read offset for self.symbolings to the offset for this
      # symbol:
      self.symbolings.seek(self.offsets[symbol.id])

      while True:
        line = self.symbolings.readline().rstrip()
        if not line:
          break
        (id, revnum, event_type, cvs_symbol_id) = line.split()
        id = int(id, 16)
        revnum = int(revnum)
        if id != symbol.id:
          # We have read beyond this symbol's lines:
          break
        cvs_symbol_id = int(cvs_symbol_id, 16)

        yield (revnum, event_type, cvs_symbol_id)

  def get_range_map(self, svn_symbol_commit):
    """Return the ranges of all CVSSymbols in SVN_SYMBOL_COMMIT.

    Return a map { CVSSymbol : SVNRevisionRange }."""

    # A map { cvs_symbol_id : CVSSymbol }:
    cvs_symbol_map = {}
    for cvs_symbol in svn_symbol_commit.get_cvs_items():
      cvs_symbol_map[cvs_symbol.id] = cvs_symbol

    range_map = {}

    for (revnum, event_type, cvs_symbol_id) \
            in self._generate_lines(svn_symbol_commit.symbol):
      cvs_symbol = cvs_symbol_map.get(cvs_symbol_id)
      if cvs_symbol is None:
        # This CVSSymbol is not part of SVN_SYMBOL_COMMIT.
        continue
      # Note: renamed from 'range' to avoid shadowing the builtin.
      rev_range = range_map.get(cvs_symbol)
      if event_type == OPENING:
        if rev_range is not None:
          raise InternalError(
              'Multiple openings logged for %r' % (cvs_symbol,)
              )
        range_map[cvs_symbol] = SVNRevisionRange(
            cvs_symbol.source_lod, revnum
            )
      else:
        if rev_range is None:
          raise InternalError(
              'Closing precedes opening for %r' % (cvs_symbol,)
              )
        if rev_range.closing_revnum is not None:
          raise InternalError(
              'Multiple closings logged for %r' % (cvs_symbol,)
              )
        rev_range.add_closing(revnum)

    # Make sure that all CVSSymbols are accounted for, and adjust the
    # closings to be not later than svn_symbol_commit.revnum.
    for cvs_symbol in cvs_symbol_map.itervalues():
      try:
        rev_range = range_map[cvs_symbol]
      except KeyError:
        raise InternalError('No opening for %s' % (cvs_symbol,))

      if rev_range.opening_revnum >= svn_symbol_commit.revnum:
        raise InternalError(
            'Opening in r%d not ready for %s in r%d'
            % (rev_range.opening_revnum, cvs_symbol,
               svn_symbol_commit.revnum,)
            )

      if rev_range.closing_revnum is not None \
             and rev_range.closing_revnum > svn_symbol_commit.revnum:
        rev_range.closing_revnum = None

    return range_map


class OutputOption:
  """Represents an output choice for a run of cvs2svn."""

  def register_artifacts(self, which_pass):
    """Register artifacts that will be needed for this output option.

    WHICH_PASS is the pass that will call our callbacks, so it should
    be used to do the registering (e.g., call
    WHICH_PASS.register_temp_file() and/or
    WHICH_PASS.register_temp_file_needed())."""

    pass

  def check(self):
    """Check that the options stored in SELF are sensible.

    This might include the existence of a repository on disk, etc."""

    raise NotImplementedError()

  def check_symbols(self, symbol_map):
    """Check that the symbols in SYMBOL_MAP are OK for this output option.

    SYMBOL_MAP is a map {AbstractSymbol : (Trunk|TypedSymbol)},
    indicating how each symbol is planned to be converted.  Raise a
    FatalError if the symbol plan is not acceptable for this output
    option."""

    raise NotImplementedError()

  def setup(self, svn_rev_count):
    """Prepare this output option."""

    raise NotImplementedError()

  def process_initial_project_commit(self, svn_commit):
    """Process SVN_COMMIT, which is an SVNInitialProjectCommit."""

    raise NotImplementedError()

  def process_primary_commit(self, svn_commit):
    """Process SVN_COMMIT, which is an SVNPrimaryCommit."""

    raise NotImplementedError()

  def process_post_commit(self, svn_commit):
    """Process SVN_COMMIT, which is an SVNPostCommit."""

    raise NotImplementedError()

  def process_branch_commit(self, svn_commit):
    """Process SVN_COMMIT, which is an SVNBranchCommit."""

    raise NotImplementedError()

  def process_tag_commit(self, svn_commit):
    """Process SVN_COMMIT, which is an SVNTagCommit."""

    raise NotImplementedError()

  def cleanup(self):
    """Perform any required cleanup related to this output option."""

    raise NotImplementedError()
+# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains tools to manage the passes of a conversion.""" + + +import time +import gc + +from cvs2svn_lib import config +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.log import Log +from cvs2svn_lib.stats_keeper import StatsKeeper +from cvs2svn_lib.stats_keeper import read_stats_keeper +from cvs2svn_lib.artifact_manager import artifact_manager + + +class InvalidPassError(FatalError): + def __init__(self, msg): + FatalError.__init__( + self, msg + '\nUse --help-passes for more information.') + + +def check_for_garbage(): + # We've turned off the garbage collector because we shouldn't + # need it (we don't create circular dependencies) and because it + # is therefore a waste of time. 
So here we check for any + # unreachable objects and generate a debug-level warning if any + # occur: + gc.set_debug(gc.DEBUG_SAVEALL) + gc_count = gc.collect() + if gc_count: + if Log().is_on(Log.DEBUG): + Log().debug( + 'INTERNAL: %d unreachable object(s) were garbage collected:' + % (gc_count,) + ) + for g in gc.garbage: + Log().debug(' %s' % (g,)) + del gc.garbage[:] + + +class Pass(object): + """Base class for one step of the conversion.""" + + def __init__(self): + # By default, use the pass object's class name as the pass name: + self.name = self.__class__.__name__ + + def register_artifacts(self): + """Register artifacts (created and needed) in artifact_manager.""" + + raise NotImplementedError + + def _register_temp_file(self, basename): + """Helper method; for brevity only.""" + + artifact_manager.register_temp_file(basename, self) + + def _register_temp_file_needed(self, basename): + """Helper method; for brevity only.""" + + artifact_manager.register_temp_file_needed(basename, self) + + def run(self, run_options, stats_keeper): + """Carry out this step of the conversion. + + RUN_OPTIONS is an instance of RunOptions. STATS_KEEPER is an + instance of StatsKeeper.""" + + raise NotImplementedError + + +class PassManager: + """Manage a list of passes that can be executed separately or all at once. + + Passes are numbered starting with 1.""" + + def __init__(self, passes): + """Construct a PassManager with the specified PASSES. + + Internally, passes are numbered starting with 1. So PASSES[0] is + considered to be pass number 1.""" + + self.passes = passes + self.num_passes = len(self.passes) + + def get_pass_number(self, pass_name, default=None): + """Return the number of the pass indicated by PASS_NAME. + + PASS_NAME should be a string containing the name or number of a + pass. If a number, it should be in the range 1 <= value <= + self.num_passes. Return an integer in the same range. 
If + PASS_NAME is the empty string and DEFAULT is specified, return + DEFAULT. Raise InvalidPassError if PASS_NAME cannot be converted + into a valid pass number.""" + + if not pass_name and default is not None: + assert 1 <= default <= self.num_passes + return default + + try: + # Does pass_name look like an integer? + pass_number = int(pass_name) + if not 1 <= pass_number <= self.num_passes: + raise InvalidPassError( + 'illegal value (%d) for pass number. Must be 1 through %d or\n' + 'the name of a known pass.' + % (pass_number,self.num_passes,)) + return pass_number + except ValueError: + # Is pass_name the name of one of the passes? + for (i, the_pass) in enumerate(self.passes): + if the_pass.name == pass_name: + return i + 1 + raise InvalidPassError('Unknown pass name (%r).' % (pass_name,)) + + def run(self, run_options): + """Run the specified passes, one after another. + + RUN_OPTIONS will be passed to the Passes' run() methods. + RUN_OPTIONS.start_pass is the number of the first pass that should + be run. RUN_OPTIONS.end_pass is the number of the last pass that + should be run. It must be that 1 <= RUN_OPTIONS.start_pass <= + RUN_OPTIONS.end_pass <= self.num_passes.""" + + # Convert start_pass and end_pass into the indices of the passes + # to execute, using the Python index range convention (i.e., first + # pass executed and first pass *after* the ones that should be + # executed). 
+ index_start = run_options.start_pass - 1 + index_end = run_options.end_pass + + # Inform the artifact manager when artifacts are created and used: + for (i, the_pass) in enumerate(self.passes): + the_pass.register_artifacts() + # Each pass creates a new version of the statistics file: + artifact_manager.register_temp_file( + config.STATISTICS_FILE % (i + 1,), the_pass + ) + if i != 0: + # Each pass subsequent to the first reads the statistics file + # from the preceding pass: + artifact_manager.register_temp_file_needed( + config.STATISTICS_FILE % (i + 1 - 1,), the_pass + ) + + # Tell the artifact manager about passes that are being skipped this run: + for the_pass in self.passes[0:index_start]: + artifact_manager.pass_skipped(the_pass) + + start_time = time.time() + for i in range(index_start, index_end): + the_pass = self.passes[i] + Log().quiet('----- pass %d (%s) -----' % (i + 1, the_pass.name,)) + artifact_manager.pass_started(the_pass) + + if i == 0: + stats_keeper = StatsKeeper() + else: + stats_keeper = read_stats_keeper( + artifact_manager.get_temp_file( + config.STATISTICS_FILE % (i + 1 - 1,) + ) + ) + + the_pass.run(run_options, stats_keeper) + end_time = time.time() + stats_keeper.log_duration_for_pass( + end_time - start_time, i + 1, the_pass.name + ) + Log().normal(stats_keeper.single_pass_timing(i + 1)) + stats_keeper.archive( + artifact_manager.get_temp_file(config.STATISTICS_FILE % (i + 1,)) + ) + start_time = end_time + Ctx().clean() + # Allow the artifact manager to clean up artifacts that are no + # longer needed: + artifact_manager.pass_done(the_pass, Ctx().skip_cleanup) + + check_for_garbage() + + # Tell the artifact manager about passes that are being deferred: + for the_pass in self.passes[index_end:]: + artifact_manager.pass_deferred(the_pass) + + Log().quiet(stats_keeper) + Log().normal(stats_keeper.timings()) + + # Consistency check: + artifact_manager.check_clean() + + def help_passes(self): + """Output (to sys.stdout) the indices and 
names of available passes.""" + + print 'PASSES:' + for (i, the_pass) in enumerate(self.passes): + print '%5d : %s' % (i + 1, the_pass.name,) + + diff --git a/cvs2svn_lib/passes.py b/cvs2svn_lib/passes.py new file mode 100644 index 0000000..af14692 --- /dev/null +++ b/cvs2svn_lib/passes.py @@ -0,0 +1,1837 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module defines the passes that make up a conversion.""" + + +import sys +import os +import shutil +import cPickle + +from cvs2svn_lib import config +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.common import warning_prefix +from cvs2svn_lib.common import FatalException +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import InternalError +from cvs2svn_lib.common import DB_OPEN_NEW +from cvs2svn_lib.common import DB_OPEN_READ +from cvs2svn_lib.common import DB_OPEN_WRITE +from cvs2svn_lib.common import Timestamper +from cvs2svn_lib.log import Log +from cvs2svn_lib.pass_manager import Pass +from cvs2svn_lib.serializer import PrimedPickleSerializer +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.cvs_file_database import CVSFileDatabase +from cvs2svn_lib.metadata_database import MetadataDatabase +from cvs2svn_lib.project import read_projects +from 
cvs2svn_lib.project import write_projects +from cvs2svn_lib.symbol import LineOfDevelopment +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.symbol import Symbol +from cvs2svn_lib.symbol import Branch +from cvs2svn_lib.symbol import Tag +from cvs2svn_lib.symbol import ExcludedSymbol +from cvs2svn_lib.symbol_database import SymbolDatabase +from cvs2svn_lib.symbol_database import create_symbol_database +from cvs2svn_lib.symbol_statistics import SymbolPlanError +from cvs2svn_lib.symbol_statistics import IndeterminateSymbolException +from cvs2svn_lib.symbol_statistics import SymbolStatistics +from cvs2svn_lib.cvs_item import CVSRevision +from cvs2svn_lib.cvs_item import CVSSymbol +from cvs2svn_lib.cvs_item_database import OldCVSItemStore +from cvs2svn_lib.cvs_item_database import IndexedCVSItemStore +from cvs2svn_lib.cvs_item_database import cvs_item_primer +from cvs2svn_lib.cvs_item_database import NewSortableCVSRevisionDatabase +from cvs2svn_lib.cvs_item_database import OldSortableCVSRevisionDatabase +from cvs2svn_lib.cvs_item_database import NewSortableCVSSymbolDatabase +from cvs2svn_lib.cvs_item_database import OldSortableCVSSymbolDatabase +from cvs2svn_lib.key_generator import KeyGenerator +from cvs2svn_lib.changeset import RevisionChangeset +from cvs2svn_lib.changeset import OrderedChangeset +from cvs2svn_lib.changeset import SymbolChangeset +from cvs2svn_lib.changeset import BranchChangeset +from cvs2svn_lib.changeset import create_symbol_changeset +from cvs2svn_lib.changeset_graph import ChangesetGraph +from cvs2svn_lib.changeset_graph_link import ChangesetGraphLink +from cvs2svn_lib.changeset_database import ChangesetDatabase +from cvs2svn_lib.changeset_database import CVSItemToChangesetTable +from cvs2svn_lib.svn_commit import SVNRevisionCommit +from cvs2svn_lib.openings_closings import SymbolingsLogger +from cvs2svn_lib.svn_commit_creator import SVNCommitCreator +from cvs2svn_lib.persistence_manager import PersistenceManager +from 
cvs2svn_lib.collect_data import CollectData +from cvs2svn_lib.process import call_command +from cvs2svn_lib.check_dependencies_pass \ + import CheckItemStoreDependenciesPass +from cvs2svn_lib.check_dependencies_pass \ + import CheckIndexedItemStoreDependenciesPass + + +def sort_file(infilename, outfilename, options=[]): + """Sort file INFILENAME, storing the results to OUTFILENAME. + + OPTIONS is an optional list of strings that are passed as additional + options to the sort command.""" + + # GNU sort will sort our dates differently (incorrectly!) if our + # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set + # it to 'C' + lc_all_tmp = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'C' + + # The -T option to sort has a nice side effect. The Win32 sort is + # case insensitive and cannot be used, and since it does not + # understand the -T option and dies if we try to use it, there is no + # risk that we use that sort by accident. + command = [ + Ctx().sort_executable, + '-T', Ctx().tmpdir + ] + options + [ + infilename + ] + + try: + # Under Windows, the subprocess module uses the Win32 + # CreateProcess, which always looks in the Windows system32 + # directory before it looks in the directories listed in the PATH + # environment variable. Since the Windows sort.exe is in the + # system32 directory it will always be chosen. A simple + # workaround is to launch the sort in a shell. When the shell + # (cmd.exe) searches it only examines the directories in the PATH + # so putting the directory with GNU sort ahead of the Windows + # system32 directory will cause GNU sort to be chosen. + call_command( + command, stdout=open(outfilename, 'w'), shell=(sys.platform=='win32') + ) + finally: + if lc_all_tmp is None: + del os.environ['LC_ALL'] + else: + os.environ['LC_ALL'] = lc_all_tmp + + # On some versions of Windows, os.system() does not return an error + # if the command fails. 
So add little consistency tests here that + # the output file was created and has the right size: + + if not os.path.exists(outfilename): + raise FatalError('Sort output file missing: %r' % (outfilename,)) + + if os.path.getsize(outfilename) != os.path.getsize(infilename): + raise FatalError( + 'Sort input and output file sizes differ:\n' + ' %r (%d bytes)\n' + ' %r (%d bytes)' % ( + infilename, os.path.getsize(infilename), + outfilename, os.path.getsize(outfilename), + ) + ) + + +class CollectRevsPass(Pass): + """This pass was formerly known as pass1.""" + + def register_artifacts(self): + self._register_temp_file(config.PROJECTS) + self._register_temp_file(config.SYMBOL_STATISTICS) + self._register_temp_file(config.METADATA_INDEX_TABLE) + self._register_temp_file(config.METADATA_STORE) + self._register_temp_file(config.CVS_FILES_DB) + self._register_temp_file(config.CVS_ITEMS_STORE) + Ctx().revision_recorder.register_artifacts(self) + + def run(self, run_options, stats_keeper): + Log().quiet("Examining all CVS ',v' files...") + Ctx()._projects = {} + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_NEW) + cd = CollectData(Ctx().revision_recorder, stats_keeper) + for project in run_options.projects: + cd.process_project(project) + run_options.projects = None + + fatal_errors = cd.close() + + if fatal_errors: + raise FatalException("Pass 1 complete.\n" + + "=" * 75 + "\n" + + "Error summary:\n" + + "\n".join(fatal_errors) + "\n" + + "Exited due to fatal error(s).") + + Ctx()._cvs_file_db.close() + write_projects(artifact_manager.get_temp_file(config.PROJECTS)) + Log().quiet("Done") + + +class CleanMetadataPass(Pass): + """Clean up CVS revision metadata and write it to a new database.""" + + def register_artifacts(self): + self._register_temp_file(config.METADATA_CLEAN_INDEX_TABLE) + self._register_temp_file(config.METADATA_CLEAN_STORE) + self._register_temp_file_needed(config.METADATA_INDEX_TABLE) + self._register_temp_file_needed(config.METADATA_STORE) + + def 
_get_clean_author(self, author): + """Return AUTHOR, converted appropriately to UTF8. + + Raise a UnicodeException if it cannot be converted using the + configured cvs_author_decoder.""" + + try: + return self._authors[author] + except KeyError: + pass + + try: + clean_author = Ctx().cvs_author_decoder(author) + except UnicodeError: + self._authors[author] = author + raise UnicodeError('Problem decoding author \'%s\'' % (author,)) + + try: + clean_author = clean_author.encode('utf8') + except UnicodeError: + self._authors[author] = author + raise UnicodeError('Problem encoding author \'%s\'' % (author,)) + + self._authors[author] = clean_author + return clean_author + + def _get_clean_log_msg(self, log_msg): + """Return LOG_MSG, converted appropriately to UTF8. + + Raise a UnicodeException if it cannot be converted using the + configured cvs_log_decoder.""" + + try: + clean_log_msg = Ctx().cvs_log_decoder(log_msg) + except UnicodeError: + raise UnicodeError( + 'Problem decoding log message:\n' + '%s\n' + '%s\n' + '%s' + % ('-' * 75, log_msg, '-' * 75,) + ) + + try: + return clean_log_msg.encode('utf8') + except UnicodeError: + raise UnicodeError( + 'Problem encoding log message:\n' + '%s\n' + '%s\n' + '%s' + % ('-' * 75, log_msg, '-' * 75,) + ) + + def _clean_metadata(self, metadata): + """Clean up METADATA by overwriting its members as necessary.""" + + try: + metadata.author = self._get_clean_author(metadata.author) + except UnicodeError, e: + Log().warn('%s: %s' % (warning_prefix, e,)) + self.warnings = True + + try: + metadata.log_msg = self._get_clean_log_msg(metadata.log_msg) + except UnicodeError, e: + Log().warn('%s: %s' % (warning_prefix, e,)) + self.warnings = True + + def run(self, run_options, stats_keeper): + Log().quiet("Converting metadata to UTF8...") + metadata_db = MetadataDatabase( + artifact_manager.get_temp_file(config.METADATA_STORE), + artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE), + DB_OPEN_READ, + ) + metadata_clean_db = 
MetadataDatabase( + artifact_manager.get_temp_file(config.METADATA_CLEAN_STORE), + artifact_manager.get_temp_file(config.METADATA_CLEAN_INDEX_TABLE), + DB_OPEN_NEW, + ) + + self.warnings = False + + # A map {author : clean_author} for those known (to avoid + # repeating warnings): + self._authors = {} + + for id in metadata_db.iterkeys(): + metadata = metadata_db[id] + + # Record the original author name because it might be needed for + # expanding CVS keywords: + metadata.original_author = metadata.author + + self._clean_metadata(metadata) + + metadata_clean_db[id] = metadata + + if self.warnings: + raise FatalError( + 'There were warnings converting author names and/or log messages\n' + 'to Unicode (see messages above). Please restart this pass\n' + 'with one or more \'--encoding\' parameters or with\n' + '\'--fallback-encoding\'.' + ) + + metadata_clean_db.close() + metadata_db.close() + Log().quiet("Done") + + +class CollateSymbolsPass(Pass): + """Divide symbols into branches, tags, and excludes.""" + + conversion_names = { + Trunk : 'trunk', + Branch : 'branch', + Tag : 'tag', + ExcludedSymbol : 'exclude', + Symbol : '.', + } + + def register_artifacts(self): + self._register_temp_file(config.SYMBOL_DB) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_STATISTICS) + + def get_symbol(self, run_options, stats): + """Use StrategyRules to decide what to do with a symbol. + + STATS is an instance of symbol_statistics._Stats describing an + instance of Symbol or Trunk. To determine how the symbol is to be + converted, consult the StrategyRules in the project's + symbol_strategy_rules. Each rule is allowed a chance to change + the way the symbol will be converted. 
If the symbol is not a + Trunk or TypedSymbol after all rules have run, raise + IndeterminateSymbolException.""" + + symbol = stats.lod + rules = run_options.project_symbol_strategy_rules[symbol.project.id] + for rule in rules: + symbol = rule.get_symbol(symbol, stats) + assert symbol is not None + + stats.check_valid(symbol) + + return symbol + + def log_symbol_summary(self, stats, symbol): + if not self.symbol_info_file: + return + + if isinstance(symbol, Trunk): + name = '.trunk.' + preferred_parent_name = '.' + else: + name = stats.lod.name + if symbol.preferred_parent_id is None: + preferred_parent_name = '.' + else: + preferred_parent = self.symbol_stats[symbol.preferred_parent_id].lod + if isinstance(preferred_parent, Trunk): + preferred_parent_name = '.trunk.' + else: + preferred_parent_name = preferred_parent.name + + if isinstance(symbol, LineOfDevelopment) and symbol.base_path: + symbol_path = symbol.base_path + else: + symbol_path = '.' + + self.symbol_info_file.write( + '%-5d %-30s %-10s %s %s\n' % ( + stats.lod.project.id, + name, + self.conversion_names[symbol.__class__], + symbol_path, + preferred_parent_name, + ) + ) + self.symbol_info_file.write(' # %s\n' % (stats,)) + parent_counts = stats.possible_parents.items() + if parent_counts: + self.symbol_info_file.write(' # Possible parents:\n') + parent_counts.sort(lambda a,b: cmp((b[1], a[0]), (a[1], b[0]))) + for (pp, count) in parent_counts: + if isinstance(pp, Trunk): + self.symbol_info_file.write( + ' # .trunk. : %d\n' % (count,) + ) + else: + self.symbol_info_file.write( + ' # %s : %d\n' % (pp.name, count,) + ) + + def get_symbols(self, run_options): + """Return a map telling how to convert symbols. + + The return value is a map {AbstractSymbol : (Trunk|TypedSymbol)}, + indicating how each symbol should be converted. Trunk objects in + SYMBOL_STATS are passed through unchanged. One object is included + in the return value for each line of development described in + SYMBOL_STATS. 
+ + Raise FatalError if there was an error.""" + + errors = [] + mismatches = [] + + if Ctx().symbol_info_filename is not None: + self.symbol_info_file = open(Ctx().symbol_info_filename, 'w') + self.symbol_info_file.write( + '# Columns: project_id symbol_name conversion symbol_path ' + 'preferred_parent_name\n' + ) + else: + self.symbol_info_file = None + + # Initialize each symbol strategy rule a single time, even if it + # is used in more than one project. First define a map from + # object id to symbol strategy rule: + rules = {} + for rule_list in run_options.project_symbol_strategy_rules: + for rule in rule_list: + rules[id(rule)] = rule + + for rule in rules.itervalues(): + rule.start(self.symbol_stats) + + retval = {} + + for stats in self.symbol_stats: + try: + symbol = self.get_symbol(run_options, stats) + except IndeterminateSymbolException, e: + self.log_symbol_summary(stats, stats.lod) + mismatches.append(e.stats) + except SymbolPlanError, e: + self.log_symbol_summary(stats, stats.lod) + errors.append(e) + else: + self.log_symbol_summary(stats, symbol) + retval[stats.lod] = symbol + + for rule in rules.itervalues(): + rule.finish() + + if self.symbol_info_file: + self.symbol_info_file.close() + + del self.symbol_info_file + + if errors or mismatches: + s = ['Problems determining how symbols should be converted:\n'] + for e in errors: + s.append('%s\n' % (e,)) + if mismatches: + s.append( + 'It is not clear how the following symbols ' + 'should be converted.\n' + 'Use --symbol-hints, --force-tag, --force-branch, --exclude, ' + 'and/or\n' + '--symbol-default to resolve the ambiguity.\n' + ) + for stats in mismatches: + s.append(' %s\n' % (stats,)) + raise FatalError(''.join(s)) + else: + return retval + + def run(self, run_options, stats_keeper): + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + self.symbol_stats = SymbolStatistics( + artifact_manager.get_temp_file(config.SYMBOL_STATISTICS) + ) + + symbol_map = 
self.get_symbols(run_options) + + # Check the symbols for consistency and bail out if there were errors: + self.symbol_stats.check_consistency(symbol_map) + + # Check that the symbols all have SVN paths set and that the paths + # are disjoint: + Ctx().output_option.check_symbols(symbol_map) + + for symbol in symbol_map.itervalues(): + if isinstance(symbol, ExcludedSymbol): + self.symbol_stats.exclude_symbol(symbol) + + create_symbol_database(symbol_map.values()) + + del self.symbol_stats + + Log().quiet("Done") + + +class FilterSymbolsPass(Pass): + """Delete any branches/tags that are to be excluded. + + Also delete revisions on excluded branches, and delete other + references to the excluded symbols.""" + + def register_artifacts(self): + self._register_temp_file(config.SUMMARY_SERIALIZER) + self._register_temp_file(config.CVS_REVS_SUMMARY_DATAFILE) + self._register_temp_file(config.CVS_SYMBOLS_SUMMARY_DATAFILE) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_STORE) + Ctx().revision_excluder.register_artifacts(self) + + def run(self, run_options, stats_keeper): + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + cvs_item_store = OldCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_STORE)) + + cvs_item_serializer = PrimedPickleSerializer(cvs_item_primer) + f = open(artifact_manager.get_temp_file(config.SUMMARY_SERIALIZER), 'wb') + cPickle.dump(cvs_item_serializer, f, -1) + f.close() + + rev_db = NewSortableCVSRevisionDatabase( + artifact_manager.get_temp_file(config.CVS_REVS_SUMMARY_DATAFILE), + cvs_item_serializer, + ) + + symbol_db = NewSortableCVSSymbolDatabase( + artifact_manager.get_temp_file(config.CVS_SYMBOLS_SUMMARY_DATAFILE), + cvs_item_serializer, + ) + + 
revision_excluder = Ctx().revision_excluder + + Log().quiet("Filtering out excluded symbols and summarizing items...") + + stats_keeper.reset_cvs_rev_info() + revision_excluder.start() + + # Process the cvs items store one file at a time: + for cvs_file_items in cvs_item_store.iter_cvs_file_items(): + Log().verbose(cvs_file_items.cvs_file.filename) + cvs_file_items.filter_excluded_symbols(revision_excluder) + cvs_file_items.mutate_symbols() + cvs_file_items.adjust_parents() + cvs_file_items.refine_symbols() + cvs_file_items.record_opened_symbols() + cvs_file_items.record_closed_symbols() + cvs_file_items.check_link_consistency() + + # Store whatever is left to the new file and update statistics: + stats_keeper.record_cvs_file(cvs_file_items.cvs_file) + for cvs_item in cvs_file_items.values(): + stats_keeper.record_cvs_item(cvs_item) + + if isinstance(cvs_item, CVSRevision): + rev_db.add(cvs_item) + elif isinstance(cvs_item, CVSSymbol): + symbol_db.add(cvs_item) + + stats_keeper.set_stats_reflect_exclude(True) + + rev_db.close() + symbol_db.close() + revision_excluder.finish() + cvs_item_store.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class SortRevisionSummaryPass(Pass): + """Sort the revision summary file.""" + + def register_artifacts(self): + self._register_temp_file(config.CVS_REVS_SUMMARY_SORTED_DATAFILE) + self._register_temp_file_needed(config.CVS_REVS_SUMMARY_DATAFILE) + + def run(self, run_options, stats_keeper): + Log().quiet("Sorting CVS revision summaries...") + sort_file( + artifact_manager.get_temp_file(config.CVS_REVS_SUMMARY_DATAFILE), + artifact_manager.get_temp_file( + config.CVS_REVS_SUMMARY_SORTED_DATAFILE)) + Log().quiet("Done") + + +class SortSymbolSummaryPass(Pass): + """Sort the symbol summary file.""" + + def register_artifacts(self): + self._register_temp_file(config.CVS_SYMBOLS_SUMMARY_SORTED_DATAFILE) + self._register_temp_file_needed(config.CVS_SYMBOLS_SUMMARY_DATAFILE) + + def run(self, 
run_options, stats_keeper): + Log().quiet("Sorting CVS symbol summaries...") + sort_file( + artifact_manager.get_temp_file(config.CVS_SYMBOLS_SUMMARY_DATAFILE), + artifact_manager.get_temp_file( + config.CVS_SYMBOLS_SUMMARY_SORTED_DATAFILE)) + Log().quiet("Done") + + +class InitializeChangesetsPass(Pass): + """Create preliminary CommitSets.""" + + def register_artifacts(self): + self._register_temp_file(config.CVS_ITEM_TO_CHANGESET) + self._register_temp_file(config.CHANGESETS_STORE) + self._register_temp_file(config.CHANGESETS_INDEX) + self._register_temp_file(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.SUMMARY_SERIALIZER) + self._register_temp_file_needed(config.CVS_REVS_SUMMARY_SORTED_DATAFILE) + self._register_temp_file_needed( + config.CVS_SYMBOLS_SUMMARY_SORTED_DATAFILE) + + def get_revision_changesets(self): + """Generate revision changesets, one at a time. + + Each time, yield a list of CVSRevisions that might potentially + constitute a changeset.""" + + # Create changesets for CVSRevisions: + old_metadata_id = None + old_timestamp = None + changeset_items = [] + + db = OldSortableCVSRevisionDatabase( + artifact_manager.get_temp_file( + config.CVS_REVS_SUMMARY_SORTED_DATAFILE + ), + self.cvs_item_serializer, + ) + + for cvs_rev in db: + if cvs_rev.metadata_id != old_metadata_id \ + or cvs_rev.timestamp > old_timestamp + config.COMMIT_THRESHOLD: + # Start a new changeset. 
First finish up the old changeset, + # if any: + if changeset_items: + yield changeset_items + changeset_items = [] + old_metadata_id = cvs_rev.metadata_id + changeset_items.append(cvs_rev) + old_timestamp = cvs_rev.timestamp + + # Finish up the last changeset, if any: + if changeset_items: + yield changeset_items + + def get_symbol_changesets(self): + """Generate symbol changesets, one at a time. + + Each time, yield a list of CVSSymbols that might potentially + constitute a changeset.""" + + old_symbol_id = None + changeset_items = [] + + db = OldSortableCVSSymbolDatabase( + artifact_manager.get_temp_file( + config.CVS_SYMBOLS_SUMMARY_SORTED_DATAFILE + ), + self.cvs_item_serializer, + ) + + for cvs_symbol in db: + if cvs_symbol.symbol.id != old_symbol_id: + # Start a new changeset. First finish up the old changeset, + # if any: + if changeset_items: + yield changeset_items + changeset_items = [] + old_symbol_id = cvs_symbol.symbol.id + changeset_items.append(cvs_symbol) + + # Finish up the last changeset, if any: + if changeset_items: + yield changeset_items + + @staticmethod + def compare_items(a, b): + return ( + cmp(a.timestamp, b.timestamp) + or cmp(a.cvs_file.cvs_path, b.cvs_file.cvs_path) + or cmp([int(x) for x in a.rev.split('.')], + [int(x) for x in b.rev.split('.')]) + or cmp(a.id, b.id)) + + def break_internal_dependencies(self, changeset_items): + """Split up CHANGESET_ITEMS if necessary to break internal dependencies. + + CHANGESET_ITEMS is a list of CVSRevisions that could possibly + belong in a single RevisionChangeset, but there might be internal + dependencies among the items. Return a list of lists, where each + sublist is a list of CVSRevisions and at least one internal + dependency has been eliminated. Iff CHANGESET_ITEMS does not have + to be split, then the return value will contain a single value, + namely the original value of CHANGESET_ITEMS. 
Split + CHANGESET_ITEMS at most once, even though the resulting changesets + might themselves have internal dependencies.""" + + # We only look for succ dependencies, since by doing so we + # automatically cover pred dependencies as well. First create a + # list of tuples (pred, succ) of id pairs for CVSItems that depend + # on each other. + dependencies = [] + changeset_cvs_item_ids = set([cvs_rev.id for cvs_rev in changeset_items]) + for cvs_item in changeset_items: + for next_id in cvs_item.get_succ_ids(): + if next_id in changeset_cvs_item_ids: + # Sanity check: a CVSItem should never depend on itself: + if next_id == cvs_item.id: + raise InternalError('Item depends on itself: %s' % (cvs_item,)) + + dependencies.append((cvs_item.id, next_id,)) + + if dependencies: + # Sort the changeset_items in a defined order (chronological to the + # extent that the timestamps are correct and unique). + changeset_items.sort(self.compare_items) + indexes = {} + for (i, changeset_item) in enumerate(changeset_items): + indexes[changeset_item.id] = i + # How many internal dependencies would be broken by breaking the + # Changeset after a particular index? + breaks = [0] * len(changeset_items) + for (pred, succ,) in dependencies: + pred_index = indexes[pred] + succ_index = indexes[succ] + breaks[min(pred_index, succ_index)] += 1 + breaks[max(pred_index, succ_index)] -= 1 + best_i = None + best_count = -1 + best_time = 0 + for i in range(1, len(breaks)): + breaks[i] += breaks[i - 1] + for i in range(0, len(breaks) - 1): + if breaks[i] > best_count: + best_i = i + best_count = breaks[i] + best_time = (changeset_items[i + 1].timestamp + - changeset_items[i].timestamp) + elif breaks[i] == best_count \ + and (changeset_items[i + 1].timestamp + - changeset_items[i].timestamp) < best_time: + best_i = i + best_count = breaks[i] + best_time = (changeset_items[i + 1].timestamp + - changeset_items[i].timestamp) + # Reuse the old changeset.id for the first of the split changesets. 
+ return [changeset_items[:best_i + 1], changeset_items[best_i + 1:]] + else: + return [changeset_items] + + def break_all_internal_dependencies(self, changeset_items): + """Keep breaking CHANGESET_ITEMS up to break all internal dependencies. + + CHANGESET_ITEMS is a list of CVSRevisions that could conceivably + be part of a single changeset. Break this list into sublists, + where the CVSRevisions in each sublist are free of mutual + dependencies.""" + + # This method is written non-recursively to avoid any possible + # problems with recursion depth. + + changesets_to_split = [changeset_items] + while changesets_to_split: + changesets = self.break_internal_dependencies(changesets_to_split.pop()) + if len(changesets) == 1: + [changeset_items] = changesets + yield changeset_items + else: + # The changeset had to be split; see if either of the + # fragments have to be split: + changesets.reverse() + changesets_to_split.extend(changesets) + + def get_changesets(self): + """Generate (Changeset, [CVSItem,...]) for all changesets. + + The Changesets already have their internal dependencies broken. + The [CVSItem,...] 
list is the list of CVSItems in the + corresponding Changeset.""" + + for changeset_items in self.get_revision_changesets(): + for split_changeset_items \ + in self.break_all_internal_dependencies(changeset_items): + yield ( + RevisionChangeset( + self.changeset_key_generator.gen_id(), + [cvs_rev.id for cvs_rev in split_changeset_items] + ), + split_changeset_items, + ) + + for changeset_items in self.get_symbol_changesets(): + yield ( + create_symbol_changeset( + self.changeset_key_generator.gen_id(), + changeset_items[0].symbol, + [cvs_symbol.id for cvs_symbol in changeset_items] + ), + changeset_items, + ) + + def run(self, run_options, stats_keeper): + Log().quiet("Creating preliminary commit sets...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + + f = open(artifact_manager.get_temp_file(config.SUMMARY_SERIALIZER), 'rb') + self.cvs_item_serializer = cPickle.load(f) + f.close() + + changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_INDEX), + DB_OPEN_NEW, + ) + cvs_item_to_changeset_id = CVSItemToChangesetTable( + artifact_manager.get_temp_file(config.CVS_ITEM_TO_CHANGESET), + DB_OPEN_NEW, + ) + + self.sorted_cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_NEW) + + self.changeset_key_generator = KeyGenerator() + + for (changeset, changeset_items) in self.get_changesets(): + if Log().is_on(Log.DEBUG): + Log().debug(repr(changeset)) + changeset_db.store(changeset) + for cvs_item in changeset_items: + self.sorted_cvs_items_db.add(cvs_item) + cvs_item_to_changeset_id[cvs_item.id] = changeset.id + + self.sorted_cvs_items_db.close() + cvs_item_to_changeset_id.close() + changeset_db.close() + 
Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + del self.cvs_item_serializer + + Log().quiet("Done") + + +class ProcessedChangesetLogger: + def __init__(self): + self.processed_changeset_ids = [] + + def log(self, changeset_id): + if Log().is_on(Log.DEBUG): + self.processed_changeset_ids.append(changeset_id) + + def flush(self): + if self.processed_changeset_ids: + Log().debug( + 'Consumed changeset ids %s' + % (', '.join(['%x' % id for id in self.processed_changeset_ids]),)) + + del self.processed_changeset_ids[:] + + +class BreakRevisionChangesetCyclesPass(Pass): + """Break up any dependency cycles involving only RevisionChangesets.""" + + def register_artifacts(self): + self._register_temp_file(config.CHANGESETS_REVBROKEN_STORE) + self._register_temp_file(config.CHANGESETS_REVBROKEN_INDEX) + self._register_temp_file(config.CVS_ITEM_TO_CHANGESET_REVBROKEN) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.CHANGESETS_STORE) + self._register_temp_file_needed(config.CHANGESETS_INDEX) + self._register_temp_file_needed(config.CVS_ITEM_TO_CHANGESET) + + def get_source_changesets(self): + old_changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_INDEX), + DB_OPEN_READ) + + changeset_ids = old_changeset_db.keys() + + for changeset_id in changeset_ids: + yield old_changeset_db[changeset_id] + + old_changeset_db.close() + del old_changeset_db + + def break_cycle(self, cycle): + """Break up one or more changesets in CYCLE to help break the cycle. 
+ + CYCLE is a list of Changesets where + + cycle[i] depends on cycle[i - 1] + + Break up one or more changesets in CYCLE to make progress towards + breaking the cycle. Update self.changeset_graph accordingly. + + It is not guaranteed that the cycle will be broken by one call to + this routine, but at least some progress must be made.""" + + self.processed_changeset_logger.flush() + best_i = None + best_link = None + for i in range(len(cycle)): + # It's OK if this index wraps to -1: + link = ChangesetGraphLink( + cycle[i - 1], cycle[i], cycle[i + 1 - len(cycle)]) + + if best_i is None or link < best_link: + best_i = i + best_link = link + + if Log().is_on(Log.DEBUG): + Log().debug( + 'Breaking cycle %s by breaking node %x' % ( + ' -> '.join(['%x' % node.id for node in (cycle + [cycle[0]])]), + best_link.changeset.id,)) + + new_changesets = best_link.break_changeset(self.changeset_key_generator) + + self.changeset_graph.delete_changeset(best_link.changeset) + + for changeset in new_changesets: + self.changeset_graph.add_new_changeset(changeset) + + def run(self, run_options, stats_keeper): + Log().quiet("Breaking revision changeset dependency cycles...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + shutil.copyfile( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET), + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_REVBROKEN)) + cvs_item_to_changeset_id = CVSItemToChangesetTable( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_REVBROKEN), + DB_OPEN_WRITE) + + changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_REVBROKEN_STORE), + 
artifact_manager.get_temp_file(config.CHANGESETS_REVBROKEN_INDEX), + DB_OPEN_NEW) + + self.changeset_graph = ChangesetGraph( + changeset_db, cvs_item_to_changeset_id + ) + + max_changeset_id = 0 + for changeset in self.get_source_changesets(): + changeset_db.store(changeset) + if isinstance(changeset, RevisionChangeset): + self.changeset_graph.add_changeset(changeset) + max_changeset_id = max(max_changeset_id, changeset.id) + + self.changeset_key_generator = KeyGenerator(max_changeset_id + 1) + + self.processed_changeset_logger = ProcessedChangesetLogger() + + # Consume the graph, breaking cycles using self.break_cycle(): + for (changeset, time_range) in self.changeset_graph.consume_graph( + cycle_breaker=self.break_cycle + ): + self.processed_changeset_logger.log(changeset.id) + + self.processed_changeset_logger.flush() + del self.processed_changeset_logger + + self.changeset_graph.close() + self.changeset_graph = None + Ctx()._cvs_items_db.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class RevisionTopologicalSortPass(Pass): + """Sort RevisionChangesets into commit order. 
+ + Also convert them to OrderedChangesets, without changing their ids.""" + + def register_artifacts(self): + self._register_temp_file(config.CHANGESETS_REVSORTED_STORE) + self._register_temp_file(config.CHANGESETS_REVSORTED_INDEX) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.CHANGESETS_REVBROKEN_STORE) + self._register_temp_file_needed(config.CHANGESETS_REVBROKEN_INDEX) + self._register_temp_file_needed(config.CVS_ITEM_TO_CHANGESET_REVBROKEN) + + def get_source_changesets(self, changeset_db): + changeset_ids = changeset_db.keys() + + for changeset_id in changeset_ids: + yield changeset_db[changeset_id] + + def get_changesets(self): + changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_REVBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_REVBROKEN_INDEX), + DB_OPEN_READ, + ) + + changeset_graph = ChangesetGraph( + changeset_db, + CVSItemToChangesetTable( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_REVBROKEN + ), + DB_OPEN_READ, + ) + ) + + for changeset in self.get_source_changesets(changeset_db): + if isinstance(changeset, RevisionChangeset): + changeset_graph.add_changeset(changeset) + else: + yield changeset + + changeset_ids = [] + + # Sentry: + changeset_ids.append(None) + + for (changeset, time_range) in changeset_graph.consume_graph(): + changeset_ids.append(changeset.id) + + # Sentry: + changeset_ids.append(None) + + for i in range(1, len(changeset_ids) - 1): + changeset = changeset_db[changeset_ids[i]] + yield OrderedChangeset( + changeset.id, changeset.cvs_item_ids, i - 1, + changeset_ids[i - 1], changeset_ids[i + 1]) + + changeset_graph.close() + + def run(self, run_options, stats_keeper): + 
Log().quiet("Generating CVSRevisions in commit order...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + changesets_revordered_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_REVSORTED_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_REVSORTED_INDEX), + DB_OPEN_NEW) + + for changeset in self.get_changesets(): + changesets_revordered_db.store(changeset) + + changesets_revordered_db.close() + Ctx()._cvs_items_db.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class BreakSymbolChangesetCyclesPass(Pass): + """Break up any dependency cycles involving only SymbolChangesets.""" + + def register_artifacts(self): + self._register_temp_file(config.CHANGESETS_SYMBROKEN_STORE) + self._register_temp_file(config.CHANGESETS_SYMBROKEN_INDEX) + self._register_temp_file(config.CVS_ITEM_TO_CHANGESET_SYMBROKEN) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.CHANGESETS_REVSORTED_STORE) + self._register_temp_file_needed(config.CHANGESETS_REVSORTED_INDEX) + self._register_temp_file_needed(config.CVS_ITEM_TO_CHANGESET_REVBROKEN) + + def get_source_changesets(self): + old_changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_REVSORTED_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_REVSORTED_INDEX), + DB_OPEN_READ) + + changeset_ids = 
old_changeset_db.keys() + + for changeset_id in changeset_ids: + yield old_changeset_db[changeset_id] + + old_changeset_db.close() + + def break_cycle(self, cycle): + """Break up one or more changesets in CYCLE to help break the cycle. + + CYCLE is a list of Changesets where + + cycle[i] depends on cycle[i - 1] + + Break up one or more changesets in CYCLE to make progress towards + breaking the cycle. Update self.changeset_graph accordingly. + + It is not guaranteed that the cycle will be broken by one call to + this routine, but at least some progress must be made.""" + + self.processed_changeset_logger.flush() + best_i = None + best_link = None + for i in range(len(cycle)): + # It's OK if this index wraps to -1: + link = ChangesetGraphLink( + cycle[i - 1], cycle[i], cycle[i + 1 - len(cycle)]) + + if best_i is None or link < best_link: + best_i = i + best_link = link + + if Log().is_on(Log.DEBUG): + Log().debug( + 'Breaking cycle %s by breaking node %x' % ( + ' -> '.join(['%x' % node.id for node in (cycle + [cycle[0]])]), + best_link.changeset.id,)) + + new_changesets = best_link.break_changeset(self.changeset_key_generator) + + self.changeset_graph.delete_changeset(best_link.changeset) + + for changeset in new_changesets: + self.changeset_graph.add_new_changeset(changeset) + + def run(self, run_options, stats_keeper): + Log().quiet("Breaking symbol changeset dependency cycles...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + shutil.copyfile( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_REVBROKEN), + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_SYMBROKEN)) + cvs_item_to_changeset_id = 
CVSItemToChangesetTable( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_SYMBROKEN), + DB_OPEN_WRITE) + + changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_SYMBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_SYMBROKEN_INDEX), + DB_OPEN_NEW) + + self.changeset_graph = ChangesetGraph( + changeset_db, cvs_item_to_changeset_id + ) + + max_changeset_id = 0 + for changeset in self.get_source_changesets(): + changeset_db.store(changeset) + if isinstance(changeset, SymbolChangeset): + self.changeset_graph.add_changeset(changeset) + max_changeset_id = max(max_changeset_id, changeset.id) + + self.changeset_key_generator = KeyGenerator(max_changeset_id + 1) + + self.processed_changeset_logger = ProcessedChangesetLogger() + + # Consume the graph, breaking cycles using self.break_cycle(): + for (changeset, time_range) in self.changeset_graph.consume_graph( + cycle_breaker=self.break_cycle + ): + self.processed_changeset_logger.log(changeset.id) + + self.processed_changeset_logger.flush() + del self.processed_changeset_logger + + self.changeset_graph.close() + self.changeset_graph = None + Ctx()._cvs_items_db.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class BreakAllChangesetCyclesPass(Pass): + """Break up any dependency cycles that are closed by SymbolChangesets.""" + + def register_artifacts(self): + self._register_temp_file(config.CHANGESETS_ALLBROKEN_STORE) + self._register_temp_file(config.CHANGESETS_ALLBROKEN_INDEX) + self._register_temp_file(config.CVS_ITEM_TO_CHANGESET_ALLBROKEN) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.CHANGESETS_SYMBROKEN_STORE) + 
self._register_temp_file_needed(config.CHANGESETS_SYMBROKEN_INDEX) + self._register_temp_file_needed(config.CVS_ITEM_TO_CHANGESET_SYMBROKEN) + + def get_source_changesets(self): + old_changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_SYMBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_SYMBROKEN_INDEX), + DB_OPEN_READ) + + changeset_ids = old_changeset_db.keys() + + for changeset_id in changeset_ids: + yield old_changeset_db[changeset_id] + + old_changeset_db.close() + + def _split_retrograde_changeset(self, changeset): + """CHANGESET is retrograde. Split it into non-retrograde changesets.""" + + Log().debug('Breaking retrograde changeset %x' % (changeset.id,)) + + self.changeset_graph.delete_changeset(changeset) + + # A map { cvs_branch_id : (max_pred_ordinal, min_succ_ordinal) } + ordinal_limits = {} + for cvs_branch in changeset.iter_cvs_items(): + max_pred_ordinal = 0 + min_succ_ordinal = sys.maxint + + for pred_id in cvs_branch.get_pred_ids(): + pred_ordinal = self.ordinals.get( + self.cvs_item_to_changeset_id[pred_id], 0) + max_pred_ordinal = max(max_pred_ordinal, pred_ordinal) + + for succ_id in cvs_branch.get_succ_ids(): + succ_ordinal = self.ordinals.get( + self.cvs_item_to_changeset_id[succ_id], sys.maxint) + min_succ_ordinal = min(min_succ_ordinal, succ_ordinal) + + assert max_pred_ordinal < min_succ_ordinal + ordinal_limits[cvs_branch.id] = (max_pred_ordinal, min_succ_ordinal,) + + # Find the earliest successor ordinal: + min_min_succ_ordinal = sys.maxint + for (max_pred_ordinal, min_succ_ordinal) in ordinal_limits.values(): + min_min_succ_ordinal = min(min_min_succ_ordinal, min_succ_ordinal) + + early_item_ids = [] + late_item_ids = [] + for (id, (max_pred_ordinal, min_succ_ordinal)) in ordinal_limits.items(): + if max_pred_ordinal >= min_min_succ_ordinal: + late_item_ids.append(id) + else: + early_item_ids.append(id) + + assert early_item_ids + assert late_item_ids + + early_changeset = 
changeset.create_split_changeset( + self.changeset_key_generator.gen_id(), early_item_ids) + late_changeset = changeset.create_split_changeset( + self.changeset_key_generator.gen_id(), late_item_ids) + + self.changeset_graph.add_new_changeset(early_changeset) + self.changeset_graph.add_new_changeset(late_changeset) + + early_split = self._split_if_retrograde(early_changeset.id) + + # Because of the way we constructed it, the early changeset should + # not have to be split: + assert not early_split + + self._split_if_retrograde(late_changeset.id) + + def _split_if_retrograde(self, changeset_id): + node = self.changeset_graph[changeset_id] + pred_ordinals = [ + self.ordinals[id] + for id in node.pred_ids + if id in self.ordinals + ] + pred_ordinals.sort() + succ_ordinals = [ + self.ordinals[id] + for id in node.succ_ids + if id in self.ordinals + ] + succ_ordinals.sort() + if pred_ordinals and succ_ordinals \ + and pred_ordinals[-1] >= succ_ordinals[0]: + self._split_retrograde_changeset(self.changeset_db[node.id]) + return True + else: + return False + + def break_segment(self, segment): + """Break a changeset in SEGMENT[1:-1]. + + The range SEGMENT[1:-1] is not empty, and all of the changesets in + that range are SymbolChangesets.""" + + best_i = None + best_link = None + for i in range(1, len(segment) - 1): + link = ChangesetGraphLink(segment[i - 1], segment[i], segment[i + 1]) + + if best_i is None or link < best_link: + best_i = i + best_link = link + + if Log().is_on(Log.DEBUG): + Log().debug( + 'Breaking segment %s by breaking node %x' % ( + ' -> '.join(['%x' % node.id for node in segment]), + best_link.changeset.id,)) + + new_changesets = best_link.break_changeset(self.changeset_key_generator) + + self.changeset_graph.delete_changeset(best_link.changeset) + + for changeset in new_changesets: + self.changeset_graph.add_new_changeset(changeset) + + def break_cycle(self, cycle): + """Break up one or more SymbolChangesets in CYCLE to help break the cycle. 
+ + CYCLE is a list of SymbolChangesets where + + cycle[i] depends on cycle[i - 1] + + . Break up one or more changesets in CYCLE to make progress + towards breaking the cycle. Update self.changeset_graph + accordingly. + + It is not guaranteed that the cycle will be broken by one call to + this routine, but at least some progress must be made.""" + + if Log().is_on(Log.DEBUG): + Log().debug( + 'Breaking cycle %s' % ( + ' -> '.join(['%x' % changeset.id + for changeset in cycle + [cycle[0]]]),)) + + # Unwrap the cycle into a segment then break the segment: + self.break_segment([cycle[-1]] + cycle + [cycle[0]]) + + def run(self, run_options, stats_keeper): + Log().quiet("Breaking CVSSymbol dependency loops...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + shutil.copyfile( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_SYMBROKEN), + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_ALLBROKEN)) + self.cvs_item_to_changeset_id = CVSItemToChangesetTable( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_ALLBROKEN), + DB_OPEN_WRITE) + + self.changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_INDEX), + DB_OPEN_NEW) + + self.changeset_graph = ChangesetGraph( + self.changeset_db, self.cvs_item_to_changeset_id + ) + + # A map {changeset_id : ordinal} for OrderedChangesets: + self.ordinals = {} + # A map {ordinal : changeset_id}: + ordered_changeset_map = {} + # A list of all BranchChangeset ids: + branch_changeset_ids = [] + max_changeset_id = 0 + for changeset in 
self.get_source_changesets(): + self.changeset_db.store(changeset) + self.changeset_graph.add_changeset(changeset) + if isinstance(changeset, OrderedChangeset): + ordered_changeset_map[changeset.ordinal] = changeset.id + self.ordinals[changeset.id] = changeset.ordinal + elif isinstance(changeset, BranchChangeset): + branch_changeset_ids.append(changeset.id) + max_changeset_id = max(max_changeset_id, changeset.id) + + # An array of ordered_changeset ids, indexed by ordinal: + ordered_changesets = [] + for ordinal in range(len(ordered_changeset_map)): + id = ordered_changeset_map[ordinal] + ordered_changesets.append(id) + + ordered_changeset_ids = set(ordered_changeset_map.values()) + del ordered_changeset_map + + self.changeset_key_generator = KeyGenerator(max_changeset_id + 1) + + # First we scan through all BranchChangesets looking for + # changesets that are individually "retrograde" and splitting + # those up: + for changeset_id in branch_changeset_ids: + self._split_if_retrograde(changeset_id) + + del self.ordinals + + next_ordered_changeset = 0 + + self.processed_changeset_logger = ProcessedChangesetLogger() + + while self.changeset_graph: + # Consume any nodes that don't have predecessors: + for (changeset, time_range) \ + in self.changeset_graph.consume_nopred_nodes(): + self.processed_changeset_logger.log(changeset.id) + if changeset.id in ordered_changeset_ids: + next_ordered_changeset += 1 + ordered_changeset_ids.remove(changeset.id) + + self.processed_changeset_logger.flush() + + if not self.changeset_graph: + break + + # Now work on the next ordered changeset that has not yet been + # processed. 
BreakSymbolChangesetCyclesPass has broken any + # cycles involving only SymbolChangesets, so the presence of a + # cycle implies that there is at least one ordered changeset + # left in the graph: + assert next_ordered_changeset < len(ordered_changesets) + + id = ordered_changesets[next_ordered_changeset] + path = self.changeset_graph.search_for_path(id, ordered_changeset_ids) + if path: + if Log().is_on(Log.DEBUG): + Log().debug('Breaking path from %s to %s' % (path[0], path[-1],)) + self.break_segment(path) + else: + # There were no ordered changesets among the reachable + # predecessors, so do generic cycle-breaking: + if Log().is_on(Log.DEBUG): + Log().debug( + 'Breaking generic cycle found from %s' + % (self.changeset_db[id],) + ) + self.break_cycle(self.changeset_graph.find_cycle(id)) + + del self.processed_changeset_logger + self.changeset_graph.close() + self.changeset_graph = None + self.cvs_item_to_changeset_id = None + self.changeset_db = None + + Log().quiet("Done") + + +class TopologicalSortPass(Pass): + """Sort changesets into commit order.""" + + def register_artifacts(self): + self._register_temp_file(config.CHANGESETS_SORTED_DATAFILE) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.CHANGESETS_ALLBROKEN_STORE) + self._register_temp_file_needed(config.CHANGESETS_ALLBROKEN_INDEX) + self._register_temp_file_needed(config.CVS_ITEM_TO_CHANGESET_ALLBROKEN) + + def get_source_changesets(self, changeset_db): + for changeset_id in changeset_db.keys(): + yield changeset_db[changeset_id] + + def get_changesets(self): + """Generate (changeset, timestamp) pairs in commit order.""" + + changeset_db = ChangesetDatabase( + 
artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_INDEX), + DB_OPEN_READ) + + changeset_graph = ChangesetGraph( + changeset_db, + CVSItemToChangesetTable( + artifact_manager.get_temp_file( + config.CVS_ITEM_TO_CHANGESET_ALLBROKEN + ), + DB_OPEN_READ, + ), + ) + symbol_changeset_ids = set() + + for changeset in self.get_source_changesets(changeset_db): + changeset_graph.add_changeset(changeset) + if isinstance(changeset, SymbolChangeset): + symbol_changeset_ids.add(changeset.id) + + # Ensure a monotonically-increasing timestamp series by keeping + # track of the previous timestamp and ensuring that the following + # one is larger. + timestamper = Timestamper() + + for (changeset, time_range) in changeset_graph.consume_graph(): + timestamp = timestamper.get( + time_range.t_max, changeset.id in symbol_changeset_ids + ) + yield (changeset, timestamp) + + changeset_graph.close() + + def run(self, run_options, stats_keeper): + Log().quiet("Generating CVSRevisions in commit order...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + sorted_changesets = open( + artifact_manager.get_temp_file(config.CHANGESETS_SORTED_DATAFILE), + 'w') + + for (changeset, timestamp) in self.get_changesets(): + sorted_changesets.write('%x %08x\n' % (changeset.id, timestamp,)) + + sorted_changesets.close() + + Ctx()._cvs_items_db.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class CreateRevsPass(Pass): + """Generate the SVNCommit <-> CVSRevision mapping databases. 
+ + SVNCommitCreator also calls SymbolingsLogger to register + CVSRevisions that represent an opening or closing for a path on a + branch or tag. See SymbolingsLogger for more details. + + This pass was formerly known as pass5.""" + + def register_artifacts(self): + self._register_temp_file(config.SVN_COMMITS_INDEX_TABLE) + self._register_temp_file(config.SVN_COMMITS_STORE) + self._register_temp_file(config.CVS_REVS_TO_SVN_REVNUMS) + self._register_temp_file(config.SYMBOL_OPENINGS_CLOSINGS) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.CHANGESETS_ALLBROKEN_STORE) + self._register_temp_file_needed(config.CHANGESETS_ALLBROKEN_INDEX) + self._register_temp_file_needed(config.CHANGESETS_SORTED_DATAFILE) + + def get_changesets(self): + """Generate (changeset,timestamp,) tuples in commit order.""" + + changeset_db = ChangesetDatabase( + artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_STORE), + artifact_manager.get_temp_file(config.CHANGESETS_ALLBROKEN_INDEX), + DB_OPEN_READ) + + for line in file( + artifact_manager.get_temp_file( + config.CHANGESETS_SORTED_DATAFILE)): + [changeset_id, timestamp] = [int(s, 16) for s in line.strip().split()] + yield (changeset_db[changeset_id], timestamp) + + changeset_db.close() + + def get_svn_commits(self, creator): + """Generate the SVNCommits, in order.""" + + for (changeset, timestamp) in self.get_changesets(): + for svn_commit in creator.process_changeset(changeset, timestamp): + yield svn_commit + + def log_svn_commit(self, svn_commit): + """Output information about SVN_COMMIT.""" + + Log().normal( + 'Creating Subversion r%d (%s)' + % (svn_commit.revnum, svn_commit.get_description(),) + ) + + if isinstance(svn_commit, SVNRevisionCommit): 
+ for cvs_rev in svn_commit.cvs_revs: + Log().verbose(' %s %s' % (cvs_rev.cvs_path, cvs_rev.rev,)) + + def run(self, run_options, stats_keeper): + Log().quiet("Mapping CVS revisions to Subversion commits...") + + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + + Ctx()._symbolings_logger = SymbolingsLogger() + + persistence_manager = PersistenceManager(DB_OPEN_NEW) + + creator = SVNCommitCreator() + for svn_commit in self.get_svn_commits(creator): + self.log_svn_commit(svn_commit) + persistence_manager.put_svn_commit(svn_commit) + + stats_keeper.set_svn_rev_count(creator.revnum_generator.get_last_id()) + del creator + + persistence_manager.close() + Ctx()._symbolings_logger.close() + Ctx()._cvs_items_db.close() + Ctx()._symbol_db.close() + Ctx()._cvs_file_db.close() + + Log().quiet("Done") + + +class SortSymbolsPass(Pass): + """This pass was formerly known as pass6.""" + + def register_artifacts(self): + self._register_temp_file(config.SYMBOL_OPENINGS_CLOSINGS_SORTED) + self._register_temp_file_needed(config.SYMBOL_OPENINGS_CLOSINGS) + + def run(self, run_options, stats_keeper): + Log().quiet("Sorting symbolic name source revisions...") + + sort_file( + artifact_manager.get_temp_file(config.SYMBOL_OPENINGS_CLOSINGS), + artifact_manager.get_temp_file( + config.SYMBOL_OPENINGS_CLOSINGS_SORTED), + options=['-k', '1,1', '-k', '2,2n', '-k', '3'], + ) + Log().quiet("Done") + + +class IndexSymbolsPass(Pass): + """This pass was formerly known as pass7.""" + + def register_artifacts(self): + self._register_temp_file(config.SYMBOL_OFFSETS_DB) + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.SYMBOL_DB) + 
self._register_temp_file_needed(config.SYMBOL_OPENINGS_CLOSINGS_SORTED) + + def generate_offsets_for_symbolings(self): + """This function iterates through all the lines in + SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping + SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED + where SYMBOLIC_NAME is first encountered. This will allow us to + seek to the various offsets in the file and sequentially read only + the openings and closings that we need.""" + + offsets = {} + + f = open( + artifact_manager.get_temp_file( + config.SYMBOL_OPENINGS_CLOSINGS_SORTED), + 'r') + old_id = None + while True: + fpos = f.tell() + line = f.readline() + if not line: + break + id, svn_revnum, ignored = line.split(" ", 2) + id = int(id, 16) + if id != old_id: + Log().verbose(' ', Ctx()._symbol_db.get_symbol(id).name) + old_id = id + offsets[id] = fpos + + f.close() + + offsets_db = file( + artifact_manager.get_temp_file(config.SYMBOL_OFFSETS_DB), 'wb') + cPickle.dump(offsets, offsets_db, -1) + offsets_db.close() + + def run(self, run_options, stats_keeper): + Log().quiet("Determining offsets for all symbolic names...") + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._symbol_db = SymbolDatabase() + self.generate_offsets_for_symbolings() + Ctx()._symbol_db.close() + Log().quiet("Done.") + + +class OutputPass(Pass): + """This pass was formerly known as pass8.""" + + def register_artifacts(self): + self._register_temp_file_needed(config.PROJECTS) + self._register_temp_file_needed(config.CVS_FILES_DB) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_STORE) + self._register_temp_file_needed(config.CVS_ITEMS_SORTED_INDEX_TABLE) + self._register_temp_file_needed(config.SYMBOL_DB) + self._register_temp_file_needed(config.METADATA_CLEAN_INDEX_TABLE) + self._register_temp_file_needed(config.METADATA_CLEAN_STORE) + self._register_temp_file_needed(config.SVN_COMMITS_INDEX_TABLE) + 
self._register_temp_file_needed(config.SVN_COMMITS_STORE) + self._register_temp_file_needed(config.CVS_REVS_TO_SVN_REVNUMS) + Ctx().output_option.register_artifacts(self) + + def get_svn_commits(self): + """Generate the SVNCommits in commit order.""" + + persistence_manager = PersistenceManager(DB_OPEN_READ) + + svn_revnum = 1 # The first non-trivial commit + + # Peek at the first revision to find the date to use to initialize + # the repository: + svn_commit = persistence_manager.get_svn_commit(svn_revnum) + + while svn_commit: + yield svn_commit + svn_revnum += 1 + svn_commit = persistence_manager.get_svn_commit(svn_revnum) + + persistence_manager.close() + + def run(self, run_options, stats_keeper): + Ctx()._projects = read_projects( + artifact_manager.get_temp_file(config.PROJECTS) + ) + Ctx()._cvs_file_db = CVSFileDatabase(DB_OPEN_READ) + Ctx()._metadata_db = MetadataDatabase( + artifact_manager.get_temp_file(config.METADATA_CLEAN_STORE), + artifact_manager.get_temp_file(config.METADATA_CLEAN_INDEX_TABLE), + DB_OPEN_READ, + ) + Ctx()._cvs_items_db = IndexedCVSItemStore( + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_STORE), + artifact_manager.get_temp_file(config.CVS_ITEMS_SORTED_INDEX_TABLE), + DB_OPEN_READ) + Ctx()._symbol_db = SymbolDatabase() + + Ctx().output_option.setup(stats_keeper.svn_rev_count()) + + for svn_commit in self.get_svn_commits(): + svn_commit.output(Ctx().output_option) + + Ctx().output_option.cleanup() + + Ctx()._symbol_db.close() + Ctx()._cvs_items_db.close() + Ctx()._metadata_db.close() + Ctx()._cvs_file_db.close() + + +# The list of passes constituting a run of cvs2svn: +passes = [ + CollectRevsPass(), + CleanMetadataPass(), + CollateSymbolsPass(), + #CheckItemStoreDependenciesPass(config.CVS_ITEMS_STORE), + FilterSymbolsPass(), + SortRevisionSummaryPass(), + SortSymbolSummaryPass(), + InitializeChangesetsPass(), + #CheckIndexedItemStoreDependenciesPass( + # config.CVS_ITEMS_SORTED_STORE, + # 
config.CVS_ITEMS_SORTED_INDEX_TABLE), + BreakRevisionChangesetCyclesPass(), + RevisionTopologicalSortPass(), + BreakSymbolChangesetCyclesPass(), + BreakAllChangesetCyclesPass(), + TopologicalSortPass(), + CreateRevsPass(), + SortSymbolsPass(), + IndexSymbolsPass(), + OutputPass(), + ] + + diff --git a/cvs2svn_lib/persistence_manager.py b/cvs2svn_lib/persistence_manager.py new file mode 100644 index 0000000..8a622ab --- /dev/null +++ b/cvs2svn_lib/persistence_manager.py @@ -0,0 +1,106 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""This module contains class PersistenceManager.""" + + +from cvs2svn_lib import config +from cvs2svn_lib.common import DB_OPEN_NEW +from cvs2svn_lib.common import DB_OPEN_READ +from cvs2svn_lib.common import SVN_INVALID_REVNUM +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.record_table import SignedIntegerPacker +from cvs2svn_lib.record_table import RecordTable +from cvs2svn_lib.serializer import PrimedPickleSerializer +from cvs2svn_lib.database import IndexedDatabase +from cvs2svn_lib.svn_commit import SVNRevisionCommit +from cvs2svn_lib.svn_commit import SVNInitialProjectCommit +from cvs2svn_lib.svn_commit import SVNPrimaryCommit +from cvs2svn_lib.svn_commit import SVNBranchCommit +from cvs2svn_lib.svn_commit import SVNTagCommit +from cvs2svn_lib.svn_commit import SVNPostCommit + + +class PersistenceManager: + """The PersistenceManager allows us to effectively store SVNCommits + to disk and retrieve them later using only their subversion revision + number as the key. It also returns the subversion revision number + for a given CVSRevision's unique key. + + All information pertinent to each SVNCommit is stored in a series of + on-disk databases so that SVNCommits can be retrieved on-demand. + + MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ. + In 'new' mode, PersistenceManager will initialize a new set of on-disk + databases and be fully-featured. 
+ In 'read' mode, PersistenceManager will open existing on-disk databases + and the set_* methods will be unavailable.""" + + def __init__(self, mode): + self.mode = mode + if mode not in (DB_OPEN_NEW, DB_OPEN_READ): + raise RuntimeError, "Invalid 'mode' argument to PersistenceManager" + primer = ( + SVNInitialProjectCommit, + SVNPrimaryCommit, + SVNPostCommit, + SVNBranchCommit, + SVNTagCommit, + ) + serializer = PrimedPickleSerializer(primer) + self.svn_commit_db = IndexedDatabase( + artifact_manager.get_temp_file(config.SVN_COMMITS_INDEX_TABLE), + artifact_manager.get_temp_file(config.SVN_COMMITS_STORE), + mode, serializer) + self.cvs2svn_db = RecordTable( + artifact_manager.get_temp_file(config.CVS_REVS_TO_SVN_REVNUMS), + mode, SignedIntegerPacker(SVN_INVALID_REVNUM)) + + def get_svn_revnum(self, cvs_rev_id): + """Return the Subversion revision number in which CVS_REV_ID was + committed, or SVN_INVALID_REVNUM if there is no mapping for + CVS_REV_ID.""" + + return self.cvs2svn_db.get(cvs_rev_id, SVN_INVALID_REVNUM) + + def get_svn_commit(self, svn_revnum): + """Return an SVNCommit that corresponds to SVN_REVNUM. 
+ + If no SVNCommit exists for revnum SVN_REVNUM, then return None.""" + + return self.svn_commit_db.get(svn_revnum, None) + + def put_svn_commit(self, svn_commit): + """Record the bidirectional mapping between SVN_REVNUM and + CVS_REVS and record associated attributes.""" + + if self.mode == DB_OPEN_READ: + raise RuntimeError, \ + 'Write operation attempted on read-only PersistenceManager' + + self.svn_commit_db[svn_commit.revnum] = svn_commit + + if isinstance(svn_commit, SVNRevisionCommit): + for cvs_rev in svn_commit.cvs_revs: + self.cvs2svn_db[cvs_rev.id] = svn_commit.revnum + + def close(self): + self.cvs2svn_db.close() + self.cvs2svn_db = None + self.svn_commit_db.close() + self.svn_commit_db = None + + diff --git a/cvs2svn_lib/process.py b/cvs2svn_lib/process.py new file mode 100644 index 0000000..56469ce --- /dev/null +++ b/cvs2svn_lib/process.py @@ -0,0 +1,116 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains generic utilities used by cvs2svn.""" + + +import subprocess + +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import CommandError + + +def call_command(command, **kw): + """Call the specified command, checking that it exits successfully. 
+ + Raise a FatalError if the command cannot be executed, or if it exits + with a non-zero exit code. Pass KW as keyword arguments to + subprocess.call().""" + + try: + retcode = subprocess.call(command, **kw) + if retcode < 0: + raise FatalError( + 'Command terminated by signal %d: "%s"' + % (-retcode, ' '.join(command),) + ) + elif retcode > 0: + raise FatalError( + 'Command failed with return code %d: "%s"' + % (retcode, ' '.join(command),) + ) + except OSError, e: + raise FatalError( + 'Command execution failed (%s): "%s"' + % (e, ' '.join(command),) + ) + + +class CommandFailedException(Exception): + """Exception raised if check_command_runs() fails.""" + + pass + + +def check_command_runs(cmd, cmdname): + """Check whether the command CMD can be executed without errors. + + CMD is a list or string, as accepted by subprocess.Popen(). CMDNAME + is the name of the command as it should be included in exception + error messages. + + This function checks three things: (1) the command can be run + without throwing an OSError; (2) it exits with status=0; (3) it + doesn't output anything to stderr. 
If any of these conditions is + not met, raise a CommandFailedException describing the problem.""" + + try: + pipe = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + except OSError, e: + raise CommandFailedException('error executing %s: %s' % (cmdname, e,)) + pipe.stdin.close() + pipe.stdout.read() + errmsg = pipe.stderr.read() + status = pipe.wait() + if status or errmsg: + msg = 'error executing %s: status %s' % (cmdname, status,) + if errmsg: + msg += ', error output:\n%s' % (errmsg,) + raise CommandFailedException(msg) + + +class PipeStream(object): + """A file-like object from which revision contents can be read.""" + + def __init__(self, pipe_command): + self._pipe_command_str = ' '.join(pipe_command) + self.pipe = subprocess.Popen( + pipe_command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + self.pipe.stdin.close() + + def read(self, size=None): + if size is None: + return self.pipe.stdout.read() + else: + return self.pipe.stdout.read(size) + + def close(self): + self.pipe.stdout.close() + error_output = self.pipe.stderr.read() + exit_status = self.pipe.wait() + if exit_status: + raise CommandError(self._pipe_command_str, exit_status, error_output) + + diff --git a/cvs2svn_lib/project.py b/cvs2svn_lib/project.py new file mode 100644 index 0000000..0fe92df --- /dev/null +++ b/cvs2svn_lib/project.py @@ -0,0 +1,219 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. 
+# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains database facilities used by cvs2svn.""" + + +import re +import os +import cPickle + +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import IllegalSVNPathError +from cvs2svn_lib.common import normalize_svn_path +from cvs2svn_lib.common import verify_paths_disjoint +from cvs2svn_lib.symbol_transform import CompoundSymbolTransform + + +class FileInAndOutOfAtticException(Exception): + def __init__(self, non_attic_path, attic_path): + Exception.__init__( + self, + "A CVS repository cannot contain both %s and %s" + % (non_attic_path, attic_path)) + + self.non_attic_path = non_attic_path + self.attic_path = attic_path + + +def normalize_ttb_path(opt, path, allow_empty=False): + try: + return normalize_svn_path(path, allow_empty) + except IllegalSVNPathError, e: + raise FatalError('Problem with %s: %s' % (opt, e,)) + + +class Project(object): + """A project within a CVS repository.""" + + def __init__( + self, id, project_cvs_repos_path, + initial_directories=[], + symbol_transforms=None, + ): + """Create a new Project record. + + ID is a unique id for this project. PROJECT_CVS_REPOS_PATH is the + main CVS directory for this project (within the filesystem). + + INITIAL_DIRECTORIES is an iterable of all SVN directories that + should be created when the project is first created. Normally, + this should include the trunk, branches, and tags directory. 
+ + SYMBOL_TRANSFORMS is an iterable of SymbolTransform instances + which will be used to transform any symbol names within this + project.""" + + self.id = id + + self.project_cvs_repos_path = os.path.normpath(project_cvs_repos_path) + if not os.path.isdir(self.project_cvs_repos_path): + raise FatalError("The specified CVS repository path '%s' is not an " + "existing directory." % self.project_cvs_repos_path) + + self.cvs_repository_root, self.cvs_module = \ + self.determine_repository_root( + os.path.abspath(self.project_cvs_repos_path)) + + # A regexp matching project_cvs_repos_path plus an optional separator: + self.project_prefix_re = re.compile( + r'^' + re.escape(self.project_cvs_repos_path) + + r'(' + re.escape(os.sep) + r'|$)') + + # The SVN directories to add when the project is first created: + self._initial_directories = [] + + for path in initial_directories: + try: + path = normalize_svn_path(path, False) + except IllegalSVNPathError, e: + raise FatalError( + 'Initial directory %r is not a legal SVN path: %s' + % (path, e,) + ) + self._initial_directories.append(path) + + verify_paths_disjoint(*self._initial_directories) + + # A list of transformation rules (regexp, replacement) applied to + # symbol names in this project. + if symbol_transforms is None: + symbol_transforms = [] + + self.symbol_transform = CompoundSymbolTransform(symbol_transforms) + + # The ID of the Trunk instance for this Project. This member is + # filled in during CollectRevsPass. + self.trunk_id = None + + # The ID of the CVSDirectory representing the root directory of + # this project. This member is filled in during CollectRevsPass. 
+ self.root_cvs_directory_id = None + + def __eq__(self, other): + return self.id == other.id + + def __cmp__(self, other): + return cmp(self.cvs_module, other.cvs_module) \ + or cmp(self.id, other.id) + + def __hash__(self): + return self.id + + @staticmethod + def determine_repository_root(path): + """Ascend above the specified PATH if necessary to find the + cvs_repository_root (a directory containing a CVSROOT directory) + and the cvs_module (the path of the conversion root within the cvs + repository). Return the root path and the module path of this + project relative to the root. + + NB: cvs_module must be seperated by '/', *not* by os.sep.""" + + def is_cvs_repository_root(path): + return os.path.isdir(os.path.join(path, 'CVSROOT')) + + original_path = path + cvs_module = '' + while not is_cvs_repository_root(path): + # Step up one directory: + prev_path = path + path, module_component = os.path.split(path) + if path == prev_path: + # Hit the root (of the drive, on Windows) without finding a + # CVSROOT dir. + raise FatalError( + "the path '%s' is not a CVS repository, nor a path " + "within a CVS repository. A CVS repository contains " + "a CVSROOT directory within its root directory." + % (original_path,)) + + cvs_module = module_component + "/" + cvs_module + + return path, cvs_module + + def transform_symbol(self, cvs_file, symbol_name, revision): + """Transform the symbol SYMBOL_NAME. + + SYMBOL_NAME refers to revision number REVISION in CVS_FILE. + REVISION is the CVS revision number as a string, with zeros + removed (e.g., '1.7' or '1.7.2'). Use the renaming rules + specified with --symbol-transform to possibly rename the symbol. + Return the transformed symbol name, the original name if it should + not be transformed, or None if the symbol should be omitted from + the conversion.""" + + return self.symbol_transform.transform(cvs_file, symbol_name, revision) + + def get_trunk(self): + """Return the Trunk instance for this project. 
def read_projects(filename):
  """Read the pickled list of Project instances from FILENAME.

  Return a map {project.id : project}.  The file handle is closed
  deterministically instead of being leaked as before."""

  f = open(filename, 'rb')
  try:
    projects = cPickle.load(f)
  finally:
    f.close()
  retval = {}
  for project in projects:
    retval[project.id] = project
  return retval


def write_projects(filename):
  """Pickle all Project instances registered in Ctx() to FILENAME.

  Uses the highest pickle protocol (-1).  The file handle is closed
  deterministically instead of being leaked as before."""

  f = open(filename, 'wb')
  try:
    cPickle.dump(Ctx()._projects.values(), f, -1)
  finally:
    f.close()
class SVNPropertySetter:
  """Abstract base class for objects that set properties on an SVNCommitItem."""

  def set_properties(self, s_item):
    """Set any properties that can be determined for S_ITEM.

    S_ITEM is an instance of SVNCommitItem.  This method should modify
    S_ITEM.svn_props in place."""

    raise NotImplementedError


class CVSRevisionNumberSetter(SVNPropertySetter):
  """Record the CVS revision number in the cvs2svn:cvs-rev property."""

  propname = 'cvs2svn:cvs-rev'

  def set_properties(self, s_item):
    # Respect a value that an earlier setter has already chosen:
    if self.propname not in s_item.svn_props:
      s_item.svn_props[self.propname] = s_item.cvs_rev.rev
      s_item.svn_props_changed = True


class ExecutablePropertySetter(SVNPropertySetter):
  """Set svn:executable when the CVS file was marked executable."""

  propname = 'svn:executable'

  def set_properties(self, s_item):
    if self.propname in s_item.svn_props:
      return
    if s_item.cvs_rev.cvs_file.executable:
      s_item.svn_props[self.propname] = '*'


class CVSBinaryFileEOLStyleSetter(SVNPropertySetter):
  """Force svn:eol-style to remain unset for files with CVS mode '-kb'."""

  propname = 'svn:eol-style'

  def set_properties(self, s_item):
    if self.propname in s_item.svn_props:
      return
    # None marks the property as deliberately unset:
    if s_item.cvs_rev.cvs_file.mode == 'b':
      s_item.svn_props[self.propname] = None
class MimeMapper(SVNPropertySetter):
  """A class that provides mappings from file names to MIME types."""

  propname = 'svn:mime-type'

  def __init__(self, mime_types_file):
    """Parse MIME_TYPES_FILE (mime.types format) into self.mappings."""

    self.mappings = { }

    # Use open() (not the deprecated file() builtin) and close the
    # handle deterministically instead of leaking it:
    f = open(mime_types_file)
    try:
      for line in f:
        if line.startswith("#"):
          continue

        # format of a line is something like
        # text/plain c h cpp
        extensions = line.split()
        if len(extensions) < 2:
          continue
        # Named mime_type to avoid shadowing the builtin 'type':
        mime_type = extensions.pop(0)
        for ext in extensions:
          if ext in self.mappings and self.mappings[ext] != mime_type:
            Log().error(
                "%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                % (warning_prefix, ext, self.mappings[ext], mime_type)
                )
          self.mappings[ext] = mime_type
    finally:
      f.close()

  def set_properties(self, s_item):
    """Set svn:mime-type on S_ITEM based on its file name."""

    if self.propname in s_item.svn_props:
      return

    basename, extension = os.path.splitext(s_item.cvs_rev.cvs_file.basename)

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename

    mime_type = self.mappings.get(extension, None)
    if mime_type is not None:
      s_item.svn_props[self.propname] = mime_type
class AutoPropsPropertySetter(SVNPropertySetter):
  """Set arbitrary svn properties based on an auto-props configuration.

  This class supports case-sensitive or case-insensitive pattern
  matching.  The command-line default is case-insensitive behavior,
  consistent with Subversion (see
  http://subversion.tigris.org/issues/show_bug.cgi?id=2036).

  As a special extension to Subversion's auto-props handling, if a
  property name is preceded by a '!' then that property is forced to
  be left unset.

  If a property specified in auto-props has already been set to a
  different value, print a warning and leave the old property value
  unchanged.

  Python's treatment of whitespaces in the ConfigParser module is
  buggy and inconsistent.  Usually spaces are preserved, but if there
  is at least one semicolon in the value, and the *first* semicolon is
  preceded by a space, then that is treated as the start of a comment
  and the rest of the line is silently discarded."""

  # The named groups 'name' and 'value' are required by the
  # m.group('name') / m.group('value') calls below; they had been lost
  # from these patterns during extraction and are restored here.
  property_name_pattern = r'(?P<name>[^\!\=\s]+)'
  # Matches '! propname' (force the property to remain unset):
  property_unset_re = re.compile(
      r'^\!\s*' + property_name_pattern + r'$'
      )
  # Matches 'propname = value':
  property_set_re = re.compile(
      r'^' + property_name_pattern + r'\s*\=\s*(?P<value>.*)$'
      )
  # Matches a bare 'propname' (set to the empty string):
  property_novalue_re = re.compile(
      r'^' + property_name_pattern + r'$'
      )

  # A pattern or value wrapped in matching quotation marks:
  quoted_re = re.compile(
      r'^([\'\"]).*\1$'
      )
  # A space followed by a semicolon can start a ConfigParser comment:
  comment_re = re.compile(r'\s;')

  class Pattern:
    """Describes the properties to be set for files matching a pattern."""

    def __init__(self, pattern, propdict):
      # A glob-like pattern:
      self.pattern = pattern
      # A dictionary of properties that should be set:
      self.propdict = propdict

    def match(self, basename):
      """Does the file with the specified basename match pattern?"""

      return fnmatch.fnmatch(basename, self.pattern)

  def __init__(self, configfilename, ignore_case=True):
    """Read auto-props rules from CONFIGFILENAME.

    If IGNORE_CASE, match file names case-insensitively."""

    config = ConfigParser.ConfigParser()
    if ignore_case:
      self.transform_case = self.squash_case
    else:
      config.optionxform = self.preserve_case
      self.transform_case = self.preserve_case

    configtext = open(configfilename).read()
    if self.comment_re.search(configtext):
      Log().warn(
          '%s: Please be aware that a space followed by a\n'
          'semicolon is sometimes treated as a comment in configuration\n'
          'files.  This pattern was seen in\n'
          '    %s\n'
          'Please make sure that you have not inadvertently commented\n'
          'out part of an important line.'
          % (warning_prefix, configfilename,)
          )

    config.readfp(StringIO(configtext), configfilename)
    self.patterns = []
    sections = config.sections()
    sections.sort()
    for section in sections:
      if self.transform_case(section) == 'auto-props':
        patterns = config.options(section)
        patterns.sort()
        for pattern in patterns:
          value = config.get(section, pattern)
          if value:
            self._add_pattern(pattern, value)

  def squash_case(self, s):
    return s.lower()

  def preserve_case(self, s):
    return s

  def _add_pattern(self, pattern, props):
    """Parse PROPS (a ';'-separated rule string) and record a Pattern."""

    propdict = {}
    if self.quoted_re.match(pattern):
      Log().warn(
          '%s: Quoting is not supported in auto-props; please verify rule\n'
          'for %r.  (Using pattern including quotation marks.)\n'
          % (warning_prefix, pattern,)
          )
    for prop in props.split(';'):
      prop = prop.strip()
      m = self.property_unset_re.match(prop)
      if m:
        name = m.group('name')
        Log().debug(
            'auto-props: For %r, leaving %r unset.' % (pattern, name,)
            )
        propdict[name] = None
        continue

      m = self.property_set_re.match(prop)
      if m:
        name = m.group('name')
        value = m.group('value')
        if self.quoted_re.match(value):
          Log().warn(
              '%s: Quoting is not supported in auto-props; please verify\n'
              'rule %r for pattern %r.  (Using value\n'
              'including quotation marks.)\n'
              % (warning_prefix, prop, pattern,)
              )
        Log().debug(
            'auto-props: For %r, setting %r to %r.'
            % (pattern, name, value,)
            )
        propdict[name] = value
        continue

      m = self.property_novalue_re.match(prop)
      if m:
        name = m.group('name')
        Log().debug(
            'auto-props: For %r, setting %r to the empty string'
            % (pattern, name,)
            )
        propdict[name] = ''
        continue

      Log().warn(
          '%s: in auto-props line for %r, value %r cannot be parsed (ignored)'
          % (warning_prefix, pattern, prop,)
          )

    self.patterns.append(self.Pattern(self.transform_case(pattern), propdict))

  def get_propdict(self, cvs_file):
    """Return the combined property dict for CVS_FILE over all patterns.

    Warn about (and keep the first of) contradictory values."""

    basename = self.transform_case(cvs_file.basename)
    propdict = {}
    for pattern in self.patterns:
      if pattern.match(basename):
        for (key,value) in pattern.propdict.items():
          if key in propdict:
            if propdict[key] != value:
              Log().warn(
                  "Contradictory values set for property '%s' for file %s."
                  % (key, cvs_file,))
          else:
            propdict[key] = value

    return propdict

  def set_properties(self, s_item):
    """Apply matching auto-props rules to S_ITEM, without overriding
    properties already set by earlier setters."""

    propdict = self.get_propdict(s_item.cvs_rev.cvs_file)
    for (k,v) in propdict.items():
      if k in s_item.svn_props:
        if s_item.svn_props[k] != v:
          Log().warn(
              "Property '%s' already set to %r for file %s; "
              "auto-props value (%r) ignored."
              % (k, s_item.svn_props[k], s_item.cvs_rev.cvs_path, v,))
      else:
        s_item.svn_props[k] = v


class CVSBinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
  """If the file is binary and its svn:mime-type property is not yet
  set, set it to 'application/octet-stream'."""

  propname = 'svn:mime-type'

  def set_properties(self, s_item):
    if self.propname in s_item.svn_props:
      return

    if s_item.cvs_rev.cvs_file.mode == 'b':
      s_item.svn_props[self.propname] = 'application/octet-stream'
class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
  """Set svn:eol-style based on svn:mime-type.

  If svn:mime-type is known but svn:eol-style is not, then set
  svn:eol-style based on svn:mime-type as follows: if svn:mime-type
  starts with 'text/', then set svn:eol-style to native; otherwise,
  force it to remain unset.  See also issue #39."""

  propname = 'svn:eol-style'

  def set_properties(self, s_item):
    if self.propname in s_item.svn_props:
      return

    mime_type = s_item.svn_props.get('svn:mime-type', None)
    if mime_type is None:
      # No MIME type known; leave eol-style for a later setter.
      return
    if mime_type.startswith("text/"):
      s_item.svn_props[self.propname] = 'native'
    else:
      # None marks the property as deliberately unset:
      s_item.svn_props[self.propname] = None


class DefaultEOLStyleSetter(SVNPropertySetter):
  """Fallback setter: apply a fixed eol-style if none was chosen yet."""

  propname = 'svn:eol-style'

  def __init__(self, value):
    """Initialize with the specified default VALUE."""

    self.value = value

  def set_properties(self, s_item):
    if self.propname not in s_item.svn_props:
      s_item.svn_props[self.propname] = self.value


class SVNBinaryFileKeywordsPropertySetter(SVNPropertySetter):
  """Turn off svn:keywords for files with binary svn:eol-style."""

  propname = 'svn:keywords'

  def set_properties(self, s_item):
    if self.propname in s_item.svn_props:
      return

    # An eol-style that is missing, None, or empty indicates binary:
    if not s_item.svn_props.get('svn:eol-style'):
      s_item.svn_props[self.propname] = None
See issue #2.""" + + propname = 'svn:keywords' + + def __init__(self, value): + """Use VALUE for the value of the svn:keywords property if it is + to be set.""" + + self.value = value + + def set_properties(self, s_item): + if self.propname in s_item.svn_props: + return + + if s_item.cvs_rev.cvs_file.mode in [None, 'kv', 'kvl']: + s_item.svn_props[self.propname] = self.value + + diff --git a/cvs2svn_lib/rcs_revision_manager.py b/cvs2svn_lib/rcs_revision_manager.py new file mode 100644 index 0000000..1c2dfcf --- /dev/null +++ b/cvs2svn_lib/rcs_revision_manager.py @@ -0,0 +1,51 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""Access the CVS repository via RCS's 'co' command.""" + + +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.process import check_command_runs +from cvs2svn_lib.process import PipeStream +from cvs2svn_lib.process import CommandFailedException +from cvs2svn_lib.revision_manager import RevisionReader + + +class RCSRevisionReader(RevisionReader): + """A RevisionReader that reads the contents via RCS.""" + + def __init__(self, co_executable): + self.co_executable = co_executable + try: + check_command_runs([self.co_executable, '-V'], self.co_executable) + except CommandFailedException, e: + raise FatalError('%s\n' + 'Please check that co is installed and in your PATH\n' + '(it is a part of the RCS software).' % (e,)) + + def get_content_stream(self, cvs_rev, suppress_keyword_substitution=False): + pipe_cmd = [ + self.co_executable, + '-q', + '-x,v', + '-p%s' % (cvs_rev.rev,) + ] + if suppress_keyword_substitution: + pipe_cmd.append('-kk') + pipe_cmd.append(cvs_rev.cvs_file.filename) + return PipeStream(pipe_cmd) + + diff --git a/cvs2svn_lib/rcs_stream.py b/cvs2svn_lib/rcs_stream.py new file mode 100644 index 0000000..b893819 --- /dev/null +++ b/cvs2svn_lib/rcs_stream.py @@ -0,0 +1,149 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2007 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
def msplit(s):
  """Split S into an array of lines.

  Only the newline character is a line separator; the line endings
  are part of the lines.  The result never contains a trailing empty
  string (so msplit('') == [])."""

  # s.splitlines(True) is not used because it also splits on carriage
  # returns, which must be preserved verbatim inside RCS deltas.
  # Renamed the local from 're' to 'lines' -- the old name shadowed
  # the imported 're' module.
  lines = [ i + "\n" for i in s.split("\n") ]
  # The split manufactures one extra newline on the final element;
  # strip it, and drop the element entirely if it is now empty
  # (i.e., S was empty or ended with a newline).
  lines[-1] = lines[-1][:-1]
  if not lines[-1]:
    del lines[-1]
  return lines


class MalformedDeltaException(Exception):
  """A malformed RCS delta was encountered."""

  pass
  def invert_diff(self, diff):
    """Apply the RCS diff DIFF to the current file content and simultaneously
    generate an RCS diff suitable for reverting the change.

    Raise MalformedDeltaException if DIFF is not a well-formed RCS
    delta relative to the current content."""

    # The new line list being built:
    ntexts = []
    # Index into self._texts of the first line not yet copied:
    ooff = 0
    diffs = msplit(diff)
    # The inverse delta commands being accumulated:
    ndiffs = []
    # Running difference between line numbers in the new text and the
    # old text, used to translate positions for the inverse commands:
    adjust = 0
    i = 0
    while i < len(diffs):
      admatch = self.ad_command.match(diffs[i])
      if not admatch:
        raise MalformedDeltaException('Bad ed command')
      i += 1
      sl = int(admatch.group(2))
      cn = int(admatch.group(3))
      if admatch.group(1) == 'd': # "d" - Delete command
        sl -= 1
        if sl < ooff:
          raise MalformedDeltaException('Deletion before last edit')
        if sl > len(self._texts):
          raise MalformedDeltaException('Deletion past file end')
        if sl + cn > len(self._texts):
          raise MalformedDeltaException('Deletion beyond file end')
        # Handle substitution explicitly, as add must come after del
        # (last add may end in no newline, so no command can follow).
        if i < len(diffs):
          amatch = self.a_command.match(diffs[i])
        else:
          amatch = None
        if amatch and int(amatch.group(1)) == sl + cn:
          # A delete immediately followed by an add at the same spot is
          # a substitution; its inverse is the opposite substitution,
          # re-inserting the deleted lines:
          cn2 = int(amatch.group(2))
          i += 1
          ndiffs += ["d%d %d\na%d %d\n" % \
              (sl + 1 + adjust, cn2, sl + adjust + cn2, cn)] + \
              self._texts[sl:sl + cn]
          ntexts += self._texts[ooff:sl] + diffs[i:i + cn2]
          adjust += cn2 - cn
          i += cn2
        else:
          # The inverse of a plain delete is an add of the deleted lines:
          ndiffs += ["a%d %d\n" % (sl + adjust, cn)] + \
              self._texts[sl:sl + cn]
          ntexts += self._texts[ooff:sl]
          adjust -= cn
        ooff = sl + cn
      else: # "a" - Add command
        if sl < ooff: # Also catches same place
          raise MalformedDeltaException('Insertion before last edit')
        if sl > len(self._texts):
          raise MalformedDeltaException('Insertion past file end')
        # The inverse of an add is a delete of the added range:
        ndiffs += ["d%d %d\n" % (sl + 1 + adjust, cn)]
        ntexts += self._texts[ooff:sl] + diffs[i:i + cn]
        ooff = sl
        adjust += cn
        i += cn
    self._texts = ntexts + self._texts[ooff:]
    return "".join(ndiffs)
0000000..41ab84a --- /dev/null +++ b/cvs2svn_lib/record_table.py @@ -0,0 +1,399 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""Classes to manage Databases of fixed-length records. + +The databases map small, non-negative integers to fixed-size records. +The records are written in index order to a disk file. Gaps in the +index sequence leave gaps in the data file, so for best space +efficiency the indexes of existing records should be approximately +continuous. + +To use a RecordTable, you need a class derived from Packer which can +serialize/deserialize your records into fixed-size strings. Deriving +classes have to specify how to pack records into strings and unpack +strings into records by overwriting the pack() and unpack() methods +respectively. + +Note that these classes keep track of gaps in the records that have +been written by filling them with packer.empty_value. 
# A unique value that can be used to stand for "unset" without
# preventing the use of None.
_unset = object()


class Packer(object):
  """Abstract base class for objects that (un)pack records to/from
  fixed-length strings of self.record_len bytes.

  EMPTY_VALUE, if given, is the packed string used to fill gaps in the
  record file; it defaults to record_len NUL bytes."""

  def __init__(self, record_len, empty_value=None):
    self.record_len = record_len
    if empty_value is None:
      self.empty_value = '\0' * self.record_len
    else:
      # isinstance(..., bytes) is equivalent to the old
      # 'type(...) is types.StringType' under Python 2 (where bytes is
      # str) while also being correct for packed values on Python 3:
      assert isinstance(empty_value, bytes)
      assert len(empty_value) == self.record_len
      self.empty_value = empty_value

  def pack(self, v):
    """Pack record V into a string of length self.record_len."""

    raise NotImplementedError()

  def unpack(self, s):
    """Unpack string S into a record."""

    raise NotImplementedError()


class StructPacker(Packer):
  """A Packer that uses the struct module with format string FORMAT.

  EMPTY_VALUE is given as an *unpacked* record (it is packed here);
  the _unset sentinel distinguishes 'not given' from None."""

  def __init__(self, format, empty_value=_unset):
    self.format = format
    if empty_value is not _unset:
      empty_value = self.pack(empty_value)
    else:
      empty_value = None

    Packer.__init__(self, struct.calcsize(self.format),
                    empty_value=empty_value)

  def pack(self, v):
    return struct.pack(self.format, v)

  def unpack(self, v):
    return struct.unpack(self.format, v)[0]


class UnsignedIntegerPacker(StructPacker):
  """Pack records that are single native unsigned 32-bit integers."""

  def __init__(self, empty_value=0):
    StructPacker.__init__(self, '=I', empty_value)


class SignedIntegerPacker(StructPacker):
  """Pack records that are single native signed 32-bit integers."""

  def __init__(self, empty_value=0):
    StructPacker.__init__(self, '=i', empty_value)
Of course if the computer + doesn't have large file support, only the lowest 31 bits can be + nonzero, and the offsets are limited to 2 GiB.""" + + # Convert file offsets to 8-bit little-endian unsigned longs... + INDEX_FORMAT = '= self._max_memory_cache: + self.flush() + self._limit = max(self._limit, i + 1) + + def _get_packed_record(self, i): + try: + return self._cache[i][1] + except KeyError: + if not 0 <= i < self._limit_written: + raise KeyError(i) + self.f.seek(i * self._record_len) + s = self.f.read(self._record_len) + self._cache[i] = (False, s) + if len(self._cache) >= self._max_memory_cache: + self.flush() + + return s + + def close(self): + self.flush() + self._cache = None + self.f.close() + self.f = None + + +class MmapRecordTable(AbstractRecordTable): + GROWTH_INCREMENT = 65536 + + def __init__(self, filename, mode, packer): + AbstractRecordTable.__init__(self, filename, mode, packer) + if self.mode == DB_OPEN_NEW: + self.python_file = open(self.filename, 'wb+') + self.python_file.write('\0' * self.GROWTH_INCREMENT) + self.python_file.flush() + self._filesize = self.GROWTH_INCREMENT + self.f = mmap.mmap( + self.python_file.fileno(), self._filesize, + access=mmap.ACCESS_WRITE + ) + + # The index just beyond the last record ever written: + self._limit = 0 + elif self.mode == DB_OPEN_WRITE: + self.python_file = open(self.filename, 'rb+') + self._filesize = os.path.getsize(self.filename) + self.f = mmap.mmap( + self.python_file.fileno(), self._filesize, + access=mmap.ACCESS_WRITE + ) + + # The index just beyond the last record ever written: + self._limit = os.path.getsize(self.filename) // self._record_len + elif self.mode == DB_OPEN_READ: + self.python_file = open(self.filename, 'rb') + self._filesize = os.path.getsize(self.filename) + self.f = mmap.mmap( + self.python_file.fileno(), self._filesize, + access=mmap.ACCESS_READ + ) + + # The index just beyond the last record ever written: + self._limit = os.path.getsize(self.filename) // self._record_len 
  def _set_packed_record(self, i, s):
    """Store the packed record S (self._record_len bytes) at index I.

    Raise RecordTableAccessError if the table was opened read-only and
    KeyError for a negative index.  Writing beyond the current limit
    grows the mmap'ed file (in GROWTH_INCREMENT chunks) and pads any
    intervening gap with packer.empty_value."""

    if self.mode == DB_OPEN_READ:
      raise RecordTableAccessError()
    if i < 0:
      raise KeyError()
    if i >= self._limit:
      # This write extends the range of valid indices.  First check
      # whether the file has to be enlarged:
      new_size = (i + 1) * self._record_len
      if new_size > self._filesize:
        # Round up to a multiple of GROWTH_INCREMENT so the mmap is
        # not resized for every single append:
        self._filesize = (
            (new_size + self.GROWTH_INCREMENT - 1)
            // self.GROWTH_INCREMENT
            * self.GROWTH_INCREMENT
            )
        self.f.resize(self._filesize)
      if i > self._limit:
        # Pad up to the new record with empty_value:
        self.f[self._limit * self._record_len:i * self._record_len] = \
            self.packer.empty_value * (i - self._limit)
      self._limit = i + 1

    self.f[i * self._record_len:(i + 1) * self._record_len] = s
  def __init__(self, revision_reader, target):
    """Create the target repository loader.

    TARGET is the path of the Subversion repository to be loaded via
    'svnadmin load'.  A temporary dumpfile is created (and later
    removed by finish()) to buffer one revision at a time."""

    self.target = target

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.  But
    # since we delete it ourselves, we don't want to use
    # artifact_manager.
    DumpfileDelegate.__init__(
        self, revision_reader, Ctx().get_temp_filename(DUMPFILE)
        )

    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = subprocess.Popen(
        [Ctx().svnadmin_executable, 'load', '-q', self.target],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        )
    # svnadmin's stdout is not used; close it immediately.
    # NOTE(review): stderr is a PIPE but is only read on failure or in
    # finish(); if svnadmin emits a lot of stderr output earlier, the
    # pipe buffer could fill and block -- confirm this is acceptable.
    self.loader_pipe.stdout.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.stdin)
    except IOError:
      # svnadmin died before accepting the header; surface its stderr.
      raise FatalError(
          'svnadmin failed with the following output while '
          'loading the dumpfile:\n%s'
          % (self.loader_pipe.stderr.read(),)
          )
self.dumpfile.seek(0) + self.dumpfile.truncate() + + def finish(self): + """Clean up.""" + + self.dumpfile.close() + self.loader_pipe.stdin.close() + error_output = self.loader_pipe.stderr.read() + exit_status = self.loader_pipe.wait() + del self.loader_pipe + if exit_status: + raise CommandError('svnadmin load', exit_status, error_output) + os.remove(self.dumpfile_path) + + diff --git a/cvs2svn_lib/repository_mirror.py b/cvs2svn_lib/repository_mirror.py new file mode 100644 index 0000000..72e2ba1 --- /dev/null +++ b/cvs2svn_lib/repository_mirror.py @@ -0,0 +1,897 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains the RepositoryMirror class and supporting classes. + +RepositoryMirror represents the skeleton of a versioned file tree with +multiple lines of development ('LODs'). It records the presence or +absence of files and directories, but not their contents. Given three +values (revnum, lod, cvs_path), it can tell you whether the specified +CVSPath existed on the specified LOD in the given revision number. +The file trees corresponding to the most recent revision can be +modified. + +The individual file trees are stored using immutable tree structures. 
+Each directory node is represented as a MirrorDirectory instance, +which is basically a map {cvs_path : node_id}, where cvs_path is a +CVSPath within the directory, and node_id is an integer ID that +uniquely identifies another directory node if that node is a +CVSDirectory, or None if that node is a CVSFile. If a directory node +is to be modified, then first a new node is created with a copy of the +original node's contents, then the copy is modified. A reference to +the copy also has to be stored in the parent node, meaning that the +parent node needs to be modified, and so on recursively to the root +node of the file tree. This data structure allows cheap deep copies, +which is useful for tagging and branching. + +The class must also be able to find the root directory node +corresponding to a particular (revnum, lod). This is done by keeping +an LODHistory instance for each LOD, which can determine the root +directory node ID for that LOD for any revnum. It does so by +recording changes to the root directory node ID only for revisions in +which it changed. Thus it stores two arrays, revnums (a list of the +revision numbers when the ID changed), and ids (a list of the +corresponding IDs). To find the ID for a particular revnum, first a +binary search is done in the revnums array to find the index of the +last change preceding revnum, then the corresponding ID is read from +the ids array. Since most revisions change only one LOD, this allows +storage of the history of potentially tens of thousands of LODs over +hundreds of thousands of revisions in an amount of space that scales +as O(numberOfLODs + numberOfRevisions), rather than O(numberOfLODs * +numberOfRevisions) as would be needed if the information were stored +in the equivalent of a 2D array. + +The internal operation of these classes is somewhat intricate, but the +interface attempts to hide the complexity, enforce the usage rules, +and allow efficient access. 
The most important facts to remember are +(1) that a directory node can be used for multiple purposes (for +multiple branches and for multiple revisions on a single branch), (2) +that only a node that has been created within the current revision is +allowed to be mutated, and (3) that the current revision can include +nodes carried over from prior revisions, which are immutable. + +This leads to a bewildering variety of MirrorDirectory classes. The +most important distinction is between OldMirrorDirectories and +CurrentMirrorDirectories. A single node can be represented multiple +ways in memory at the same time, depending on whether it was looked up +as part of the current revision or part of an old revision: + + MirrorDirectory -- the base class for all MirrorDirectory nodes. + This class allows lookup of subnodes and iteration over + subnodes. + + OldMirrorDirectory -- a MirrorDirectory that was looked up for an + old revision. These instances are immutable, as only the + current revision is allowed to be modified. + + CurrentMirrorDirectory -- a MirrorDirectory that was looked up for + the current revision. Such an instance is always logically + mutable, though mutating it might require the node to be + copied first. Such an instance might represent a node that + has already been copied during this revision and can therefore + be modified freely (such nodes implement + _WritableMirrorDirectoryMixin), or it might represent a node + that was carried over from an old revision and hasn't been + copied yet (such nodes implement + _ReadOnlyMirrorDirectoryMixin). If the latter, then the node + copies itself (and bubbles up the change) before allowing + itself to be modified. But the distinction is managed + internally; client classes should not have to worry about it. + + CurrentMirrorLODDirectory -- A CurrentMirrorDirectory representing + the root directory of a line of development in the current + revision. 
This class has two concrete subclasses, + _CurrentMirrorReadOnlyLODDirectory and + _CurrentMirrorWritableLODDirectory, depending on whether the + node has already been copied during this revision. + + + CurrentMirrorSubdirectory -- A CurrentMirrorDirectory representing + a subdirectory within a line of development's directory tree + in the current revision. This class has two concrete + subclasses, _CurrentMirrorReadOnlySubdirectory and + _CurrentMirrorWritableSubdirectory, depending on whether the + node has already been copied during this revision. + + DeletedCurrentMirrorDirectory -- a MirrorDirectory that has been + deleted. Such an instance is disabled so that it cannot + accidentally be used. + +While a revision is being processed, RepositoryMirror._new_nodes holds +every writable CurrentMirrorDirectory instance (i.e., every node that +has been created in the revision). Since these nodes are mutable, it +is important that there be exactly one instance associated with each +node; otherwise there would be problems keeping the instances +synchronized. These are written to the database by +RepositoryMirror.end_commit(). + +OldMirrorDirectory and read-only CurrentMirrorDirectory instances are +*not* cached; they are recreated whenever they are referenced. There +might be multiple instances referring to the same node. A read-only +CurrentMirrorDirectory instance is mutated in place into a writable +CurrentMirrorDirectory instance if it needs to be modified. + +FIXME: The rules for when a MirrorDirectory instance can continue to +be used vs. when it has to be read again (because it has been modified +indirectly and therefore copied) are confusing and error-prone. +Probably the semantics should be changed. 
+ +""" + + +import bisect + +from cvs2svn_lib import config +from cvs2svn_lib.common import DB_OPEN_NEW +from cvs2svn_lib.common import InternalError +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.cvs_file import CVSFile +from cvs2svn_lib.cvs_file import CVSDirectory +from cvs2svn_lib.key_generator import KeyGenerator +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.serializer import MarshalSerializer +from cvs2svn_lib.database import IndexedDatabase + + +class RepositoryMirrorError(Exception): + """An error related to the RepositoryMirror.""" + + pass + + +class LODExistsError(RepositoryMirrorError): + """The LOD already exists in the repository. + + Exception raised if an attempt is made to add an LOD to the + repository mirror and that LOD already exists in the youngest + revision of the repository.""" + + pass + + +class PathExistsError(RepositoryMirrorError): + """The path already exists in the repository. + + Exception raised if an attempt is made to add a path to the + repository mirror and that path already exists in the youngest + revision of the repository.""" + + pass + + +class DeletedNodeReusedError(RepositoryMirrorError): + """The MirrorDirectory has already been deleted and shouldn't be reused.""" + + pass + + +class CopyFromCurrentNodeError(RepositoryMirrorError): + """A CurrentMirrorDirectory cannot be copied to the current revision.""" + + pass + + +class MirrorDirectory(object): + """Represent a node within the RepositoryMirror. + + Instances of this class act like a map {CVSPath : MirrorDirectory}, + where CVSPath is an item within this directory (i.e., a file or + subdirectory within this directory). 
class MirrorDirectory(object):
  """A directory node within the RepositoryMirror's skeleton file tree.

  An instance behaves like a read-only map {CVSPath : MirrorDirectory},
  mapping each CVSPath contained in this directory either to another
  MirrorDirectory instance (for a subdirectory) or to None (for a
  file)."""

  def __init__(self, repo, id, entries):
    # The RepositoryMirror that owns this node:
    self.repo = repo

    # The integer ID identifying this node:
    self.id = id

    # Map {CVSPath : node_id}; the node_id is an integer for a
    # CVSDirectory entry and None for a CVSFile entry:
    self._entries = entries

  def __getitem__(self, cvs_path):
    """Return the MirrorDirectory associated with the specified subnode.

    Return a MirrorDirectory instance if the subnode is a
    CVSDirectory; None if it is a CVSFile.  Raise KeyError if the
    specified subnode does not exist."""

    raise NotImplementedError()

  def __len__(self):
    """Return the number of CVSPaths within this node."""

    return len(self._entries)

  def __contains__(self, cvs_path):
    """Return True iff CVS_PATH is contained in this node."""

    return cvs_path in self._entries

  def __iter__(self):
    """Iterate over the CVSPaths within this node."""

    return iter(self._entries)

  def _format_entries(self):
    """Render self._entries for use in subclasses' __repr__() methods."""

    def format_item(key, value):
      if value is None:
        return str(key)
      return '%s -> %x' % (key, value,)

    formatted = [
        format_item(*entry) for entry in sorted(self._entries.items())
        ]
    return '{%s}' % (', '.join(formatted),)

  def __str__(self):
    """For convenience only.  The format is subject to change at any time."""

    return '%s<%x>' % (self.__class__.__name__, self.id,)


class OldMirrorDirectory(MirrorDirectory):
  """A (read-only) directory node looked up in an old revision."""

  def __getitem__(self, cvs_path):
    sub_id = self._entries[cvs_path]
    if sub_id is None:
      # A leaf node (file); files have no node of their own.
      return None
    return OldMirrorDirectory(self.repo, sub_id, self.repo._node_db[sub_id])

  def __repr__(self):
    """For convenience only.  The format is subject to change at any time."""

    return '%s(%s)' % (self, self._format_entries(),)


class CurrentMirrorDirectory(MirrorDirectory):
  """A directory node as it exists in the mirror's current revision."""

  def __init__(self, repo, id, lod, cvs_path, entries):
    MirrorDirectory.__init__(self, repo, id, entries)
    self.lod = lod
    self.cvs_path = cvs_path

  def __getitem__(self, cvs_path):
    sub_id = self._entries[cvs_path]
    if sub_id is None:
      # A leaf node (file); files have no node of their own.
      return None
    try:
      return self.repo._new_nodes[sub_id]
    except KeyError:
      return _CurrentMirrorReadOnlySubdirectory(
          self.repo, sub_id, self.lod, cvs_path, self,
          self.repo._node_db[sub_id]
          )

  def __setitem__(self, cvs_path, node):
    """Create or overwrite a subnode of this node.

    CVS_PATH is the path of the subnode.  NODE will be the new value
    of the node; for CVSDirectories it should be a MirrorDirectory
    instance; for CVSFiles it should be None.  Deleted nodes and nodes
    created within the current revision are rejected."""

    if isinstance(node, DeletedCurrentMirrorDirectory):
      raise DeletedNodeReusedError(
          '%r has already been deleted and should not be reused' % (node,)
          )
    if isinstance(node, CurrentMirrorDirectory):
      raise CopyFromCurrentNodeError(
          '%r was created in the current node and cannot be copied' % (node,)
          )
    self._set_entry(cvs_path, node)

  def __delitem__(self, cvs_path):
    """Remove the subnode of this node at CVS_PATH.

    If the node does not exist, then raise a KeyError."""

    node = self[cvs_path]
    self._del_entry(cvs_path)
    if isinstance(node, _WritableMirrorDirectoryMixin):
      node._mark_deleted()

  def mkdir(self, cvs_directory):
    """Create an empty subdirectory of this node at CVS_DIRECTORY.

    Return the CurrentDirectory that was created."""

    assert isinstance(cvs_directory, CVSDirectory)
    if cvs_directory in self:
      raise PathExistsError(
          'Attempt to create directory \'%s\' in %s in repository mirror '
          'when it already exists.'
          % (cvs_directory, self.lod,)
          )

    subdir = _CurrentMirrorWritableSubdirectory(
        self.repo, self.repo._key_generator.gen_id(), self.lod,
        cvs_directory, self, {}
        )
    self._set_entry(cvs_directory, subdir)
    self.repo._new_nodes[subdir.id] = subdir
    return subdir

  def add_file(self, cvs_file):
    """Record the existence of CVS_FILE within this node."""

    assert isinstance(cvs_file, CVSFile)
    if cvs_file in self:
      raise PathExistsError(
          'Attempt to create file \'%s\' in %s in repository mirror '
          'when it already exists.'
          % (cvs_file, self.lod,)
          )

    self._set_entry(cvs_file, None)

  def __repr__(self):
    """For convenience only.  The format is subject to change at any time."""

    return '%s(%r, %r, %s)' % (
        self, self.lod, self.cvs_path, self._format_entries(),
        )


class DeletedCurrentMirrorDirectory(object):
  """The carcass of a deleted, formerly-writable MirrorDirectory.

  When a _WritableMirrorDirectoryMixin node is deleted, its instance
  is reclassed to this type so that any further (accidental) use of
  the stale reference fails instead of silently mutating it."""

  pass
+ + A MirrorDirectory is writable if it has already been recreated + during the current revision.""" + + def _set_entry(self, cvs_path, node): + """Create or overwrite a subnode of this node, with no checks.""" + + if node is None: + self._entries[cvs_path] = None + else: + self._entries[cvs_path] = node.id + + def _del_entry(self, cvs_path): + """Remove the subnode of this node at CVS_PATH, with no checks.""" + + del self._entries[cvs_path] + + def _mark_deleted(self): + """Mark this object and any writable descendants as being deleted.""" + + self.__class__ = DeletedCurrentMirrorDirectory + + for (cvs_path, id) in self._entries.iteritems(): + if id in self.repo._new_nodes: + node = self[cvs_path] + if isinstance(node, _WritableMirrorDirectoryMixin): + # Mark deleted and recurse: + node._mark_deleted() + + +class _ReadOnlyMirrorDirectoryMixin: + """Mixin for a CurrentMirrorDirectory that hasn't yet been made writable.""" + + def _make_writable(self): + raise NotImplementedError() + + def _set_entry(self, cvs_path, node): + """Create or overwrite a subnode of this node, with no checks.""" + + self._make_writable() + self._set_entry(cvs_path, node) + + def _del_entry(self, cvs_path): + """Remove the subnode of this node at CVS_PATH, with no checks.""" + + self._make_writable() + self._del_entry(cvs_path) + + +class CurrentMirrorLODDirectory(CurrentMirrorDirectory): + """Represent an LOD's main directory in the mirror's current version.""" + + def __init__(self, repo, id, lod, entries): + CurrentMirrorDirectory.__init__( + self, repo, id, lod, lod.project.get_root_cvs_directory(), entries + ) + + def delete(self): + """Remove the directory represented by this object.""" + + lod_history = self.repo._get_lod_history(self.lod) + assert lod_history.exists() + lod_history.update(self.repo._youngest, None) + self._mark_deleted() + + +class _CurrentMirrorReadOnlyLODDirectory( + CurrentMirrorLODDirectory, _ReadOnlyMirrorDirectoryMixin + ): + """Represent an LOD's main 
class CurrentMirrorLODDirectory(CurrentMirrorDirectory):
  """Represent an LOD's main directory in the mirror's current version."""

  def __init__(self, repo, id, lod, entries):
    CurrentMirrorDirectory.__init__(
        self, repo, id, lod, lod.project.get_root_cvs_directory(), entries
        )

  def delete(self):
    """Remove the directory represented by this object."""

    lod_history = self.repo._get_lod_history(self.lod)
    assert lod_history.exists()
    lod_history.update(self.repo._youngest, None)
    self._mark_deleted()


class _CurrentMirrorReadOnlyLODDirectory(
      CurrentMirrorLODDirectory, _ReadOnlyMirrorDirectoryMixin
      ):
  """An LOD's main directory that has not yet been copied this revision."""

  def _make_writable(self):
    # Convert this instance in place into the writable variant, under
    # a fresh node ID, and record the new root in the LOD's history:
    self.__class__ = _CurrentMirrorWritableLODDirectory
    self.id = self.repo._key_generator.gen_id()
    self.repo._new_nodes[self.id] = self
    self.repo._get_lod_history(self.lod).update(self.repo._youngest, self.id)
    # Copy the entries so the old (shared, immutable) node is untouched:
    self._entries = self._entries.copy()


class _CurrentMirrorWritableLODDirectory(
      CurrentMirrorLODDirectory, _WritableMirrorDirectoryMixin
      ):
  """An LOD's main directory that has already been copied this revision."""

  pass


class CurrentMirrorSubdirectory(CurrentMirrorDirectory):
  """Represent a subdirectory in the mirror's current version."""

  def __init__(self, repo, id, lod, cvs_path, parent_mirror_dir, entries):
    CurrentMirrorDirectory.__init__(self, repo, id, lod, cvs_path, entries)
    self.parent_mirror_dir = parent_mirror_dir

  def delete(self):
    """Remove the directory represented by this object."""

    del self.parent_mirror_dir[self.cvs_path]


class _CurrentMirrorReadOnlySubdirectory(
      CurrentMirrorSubdirectory, _ReadOnlyMirrorDirectoryMixin
      ):
  """A subdirectory that has not yet been copied this revision."""

  def _make_writable(self):
    # Convert this instance in place into the writable variant, under
    # a fresh node ID, and register the new ID with the parent:
    self.__class__ = _CurrentMirrorWritableSubdirectory
    self.id = self.repo._key_generator.gen_id()
    self.repo._new_nodes[self.id] = self
    self.parent_mirror_dir._set_entry(self.cvs_path, self)
    # Copy the entries so the old (shared, immutable) node is untouched:
    self._entries = self._entries.copy()


class _CurrentMirrorWritableSubdirectory(
      CurrentMirrorSubdirectory, _WritableMirrorDirectoryMixin
      ):
  """A subdirectory that has already been copied this revision."""

  pass
class LODHistory(object):
  """The history of root directory node IDs for a line of development.

  Records, for one LineOfDevelopment, the node ID of the LOD's root
  directory as a function of revision number.  Only revisions in which
  the root ID actually changed are stored:

    _mirror -- (RepositoryMirror) the RepositoryMirror that manages
        this LODHistory.

    lod -- (LineOfDevelopment) the LOD described by this LODHistory.

    revnums -- (list of int) revision numbers in which the ID changed,
        in increasing order.

    ids -- (list of (int or None)) the corresponding root node IDs; an
        entry of None means the LOD did not exist as of that revision.

  A lookup for revision REVNUM bisects REVNUMS for the most recent
  change not after REVNUM and returns the ID stored at the same index.
  Index 0 of both lists holds a sentinel recording that the LOD does
  not exist in r0."""

  __slots__ = ['_mirror', 'lod', 'revnums', 'ids']

  def __init__(self, mirror, lod):
    self._mirror = mirror
    self.lod = lod
    self.revnums = [0]
    self.ids = [None]

  def get_id(self, revnum):
    """Get the ID of the root path for this LOD in REVNUM.

    Raise KeyError if this LOD didn't exist in REVNUM."""

    i = bisect.bisect_right(self.revnums, revnum) - 1
    id = self.ids[i]
    if id is None:
      raise KeyError()
    return id

  def get_current_id(self):
    """Get the ID of the root path for this LOD in the current revision.

    Raise KeyError if this LOD doesn't currently exist."""

    id = self.ids[-1]
    if id is None:
      raise KeyError()
    return id

  def exists(self):
    """Return True iff LOD exists in the current revision."""

    return self.ids[-1] is not None

  def update(self, revnum, id):
    """Indicate that the root node of this LOD changed to ID at REVNUM.

    REVNUM must be no older than the last recorded change; if it
    equals the last recorded revision, that entry is overwritten
    (subject to sanity checks), otherwise a new entry is appended.  ID
    may be None to record that this LOD ceased to exist in REVNUM."""

    last_revnum = self.revnums[-1]
    if revnum < last_revnum:
      raise KeyError()

    if revnum > last_revnum:
      self.revnums.append(revnum)
      self.ids.append(id)
      return

    # This is an attempt to overwrite an entry that was already
    # updated during this revision.  Don't allow the replacement
    # None -> None or allow one new id to be replaced with another:
    old_id = self.ids[-1]
    if old_id is None and id is None:
      raise InternalError(
          'ID changed from None -> None for %s, r%d' % (self.lod, revnum,)
          )
    elif (old_id is not None and id is not None
          and old_id in self._mirror._new_nodes):
      raise InternalError(
          'ID changed from %x -> %x for %s, r%d'
          % (old_id, id, self.lod, revnum,)
          )
    self.ids[-1] = id
Don't allow the replacement + # None -> None or allow one new id to be replaced with another: + old_id = self.ids[-1] + if old_id is None and id is None: + raise InternalError( + 'ID changed from None -> None for %s, r%d' % (self.lod, revnum,) + ) + elif (old_id is not None and id is not None + and old_id in self._mirror._new_nodes): + raise InternalError( + 'ID changed from %x -> %x for %s, r%d' + % (old_id, id, self.lod, revnum,) + ) + self.ids[-1] = id + else: + self.revnums.append(revnum) + self.ids.append(id) + + +class _NodeDatabase(object): + """A database storing all of the directory nodes. + + The nodes are written in groups every time write_new_nodes() is + called. To the database is written a dictionary {node_id : + [(cvs_path.id, node_id),...]}, where the keys are the node_ids of + the new nodes. When a node is read, its whole group is read and + cached under the assumption that the other nodes in the group are + likely to be needed soon. The cache is retained across revisions + and cleared when _cache_max_size is exceeded. + + The dictionaries for nodes that have been read from the database + during the current revision are cached by node_id in the _cache + member variable. The corresponding dictionaries are *not* copied + when read. To avoid cross-talk between distinct MirrorDirectory + instances that have the same node_id, users of these dictionaries + have to copy them before modification.""" + + # How many entries should be allowed in the cache for each + # CVSDirectory in the repository. (This number is very roughly the + # number of complete lines of development that can be stored in the + # cache at one time.) 
+ CACHE_SIZE_MULTIPLIER = 5 + + # But the cache will never be limited to less than this number: + MIN_CACHE_LIMIT = 5000 + + def __init__(self): + self.cvs_file_db = Ctx()._cvs_file_db + self.db = IndexedDatabase( + artifact_manager.get_temp_file(config.MIRROR_NODES_STORE), + artifact_manager.get_temp_file(config.MIRROR_NODES_INDEX_TABLE), + DB_OPEN_NEW, serializer=MarshalSerializer(), + ) + + # A list of the maximum node_id stored by each call to + # write_new_nodes(): + self._max_node_ids = [0] + + # A map {node_id : {cvs_path : node_id}}: + self._cache = {} + + # The number of directories in the repository: + num_dirs = len([ + cvs_path + for cvs_path in self.cvs_file_db.itervalues() + if isinstance(cvs_path, CVSDirectory) + ]) + + self._cache_max_size = max( + int(self.CACHE_SIZE_MULTIPLIER * num_dirs), + self.MIN_CACHE_LIMIT, + ) + + def _load(self, items): + retval = {} + for (id, value) in items: + retval[self.cvs_file_db.get_file(id)] = value + return retval + + def _dump(self, node): + return [ + (cvs_path.id, value) + for (cvs_path, value) in node.iteritems() + ] + + def _determine_index(self, id): + """Return the index of the record holding the node with ID.""" + + return bisect.bisect_left(self._max_node_ids, id) + + def __getitem__(self, id): + try: + items = self._cache[id] + except KeyError: + index = self._determine_index(id) + for (node_id, items) in self.db[index].items(): + self._cache[node_id] = self._load(items) + items = self._cache[id] + + return items + + def write_new_nodes(self, nodes): + """Write NODES to the database. + + NODES is an iterable of writable CurrentMirrorDirectory instances.""" + + if len(self._cache) > self._cache_max_size: + # The size of the cache has exceeded the threshold. 
class RepositoryMirror:
  """Mirror a repository's directory skeleton, one revision at a time.

  For each LineOfDevelopment seen so far, an LODHistory instance in
  self._lod_histories tracks each revnum in which files were added to
  or deleted from that LOD, along with the node ID of the root of the
  tree describing the LOD's contents at that revision.

  The LOD trees themselves are stored in the _node_db database, which
  maps node IDs to nodes; a node is a map from CVSPath to the IDs of
  the corresponding subnodes.  _node_db resides on disk, and each
  access is expensive; it only holds nodes from already-committed
  revisions.  The nodes of the revision currently being constructed
  are kept in the in-memory _new_nodes map, which is cheap to access,
  until end_commit() flushes them.

  You must invoke start_commit() before each commit and end_commit()
  afterwards."""

  def register_artifacts(self, which_pass):
    """Register the artifacts that will be needed for this object."""

    artifact_manager.register_temp_file(
        config.MIRROR_NODES_INDEX_TABLE, which_pass
        )
    artifact_manager.register_temp_file(
        config.MIRROR_NODES_STORE, which_pass
        )

  def open(self):
    """Set up the RepositoryMirror and prepare it for commits."""

    self._key_generator = KeyGenerator()

    # LODHistory instances for all LODs referenced so far:
    self._lod_histories = {}

    # This corresponds to the 'nodes' table in a Subversion fs.  (No
    # 'representations' or 'strings' tables are needed because only
    # file existence is tracked, not file contents.)
    self._node_db = _NodeDatabase()

    # Start at revision 0 without a root node:
    self._youngest = 0

  def start_commit(self, revnum):
    """Start a new commit for revision REVNUM."""

    assert revnum > self._youngest
    self._youngest = revnum

    # Nodes created within this revision, as a map
    # {node_id : _WritableMirrorDirectoryMixin}:
    self._new_nodes = {}

  def end_commit(self):
    """Called at the end of each commit.

    Copy the nodes created during this revision to the on-disk node
    database, skipping any that were subsequently deleted."""

    self._node_db.write_new_nodes([
        node
        for node in self._new_nodes.values()
        if not isinstance(node, DeletedCurrentMirrorDirectory)
        ])

    del self._new_nodes

  def _get_lod_history(self, lod):
    """Return the LODHistory instance describing LOD.

    Create a new (empty) LODHistory if it doesn't yet exist."""

    try:
      return self._lod_histories[lod]
    except KeyError:
      lod_history = LODHistory(self, lod)
      self._lod_histories[lod] = lod_history
      return lod_history

  def get_old_lod_directory(self, lod, revnum):
    """Return the directory for the root path of LOD at revision REVNUM.

    Return an instance of MirrorDirectory if the path exists;
    otherwise, raise KeyError."""

    id = self._get_lod_history(lod).get_id(revnum)
    return OldMirrorDirectory(self, id, self._node_db[id])

  def get_old_path(self, cvs_path, lod, revnum):
    """Return the node for CVS_PATH from LOD at REVNUM.

    If CVS_PATH is a CVSDirectory, then return an instance of
    OldMirrorDirectory.  If CVS_PATH is a CVSFile, return None.

    If CVS_PATH does not exist in the specified LOD and REVNUM, raise
    KeyError."""

    node = self.get_old_lod_directory(lod, revnum)
    for sub_path in cvs_path.get_ancestry()[1:]:
      node = node[sub_path]
    return node

  def get_current_lod_directory(self, lod):
    """Return the directory for the root path of LOD in the current revision.

    Return an instance of CurrentMirrorDirectory.  Raise KeyError if
    the path doesn't already exist."""

    id = self._get_lod_history(lod).get_current_id()
    try:
      return self._new_nodes[id]
    except KeyError:
      return _CurrentMirrorReadOnlyLODDirectory(
          self, id, lod, self._node_db[id]
          )

  def get_current_path(self, cvs_path, lod):
    """Return the node for CVS_PATH from LOD in the current revision.

    If CVS_PATH is a CVSDirectory, then return an instance of
    CurrentMirrorDirectory.  If CVS_PATH is a CVSFile, return None.

    If CVS_PATH does not exist in the current revision of the
    specified LOD, raise KeyError."""

    node = self.get_current_lod_directory(lod)
    for sub_path in cvs_path.get_ancestry()[1:]:
      node = node[sub_path]
    return node

  def add_lod(self, lod):
    """Create a new LOD in this repository.

    Return the CurrentMirrorDirectory that was created.  If the LOD
    already exists, raise LODExistsError."""

    lod_history = self._get_lod_history(lod)
    if lod_history.exists():
      raise LODExistsError(
          'Attempt to create %s in repository mirror when it already exists.'
          % (lod,)
          )
    new_node = _CurrentMirrorWritableLODDirectory(
        self, self._key_generator.gen_id(), lod, {}
        )
    lod_history.update(self._youngest, new_node.id)
    self._new_nodes[new_node.id] = new_node
    return new_node

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Copy all of SRC_LOD at SRC_REVNUM to DEST_LOD.

    In the youngest revision of the repository, the destination LOD
    *must not* already exist; raise LODExistsError if it does.

    Return the new node at DEST_LOD, as a CurrentMirrorDirectory."""

    src_node = self.get_old_lod_directory(src_lod, src_revnum)

    dest_lod_history = self._get_lod_history(dest_lod)
    if dest_lod_history.exists():
      raise LODExistsError(
          'Attempt to copy to %s in repository mirror when it already exists.'
          % (dest_lod,)
          )

    # The copy is cheap: the destination LOD simply starts referencing
    # the source's (immutable) root node:
    dest_lod_history.update(self._youngest, src_node.id)

    # Return src_node, packaged up as a CurrentMirrorDirectory:
    return self.get_current_lod_directory(dest_lod)

  def close(self):
    """Free resources and close databases."""

    self._lod_histories = None
    self._node_db.close()
    self._node_db = None
class RevisionRecorder:
  """An object that can record text and deltas from CVS files."""

  def __init__(self):
    """Initialize the RevisionRecorder.

    A RevisionRecorder is instantiated in every program run, even if
    the data-collection pass will not be executed (this allows it to
    register the artifacts that it produces).  Therefore this method
    should not do much; more substantial preparation for use (like
    actually creating the artifacts) should be done in start()."""

    pass

  def register_artifacts(self, which_pass):
    """Register artifacts that will be needed during data recording.

    WHICH_PASS is the pass that will call our callbacks, so it should
    be used to do the registering (e.g., call
    WHICH_PASS.register_temp_file() and/or
    WHICH_PASS.register_temp_file_needed())."""

    pass

  def start(self):
    """Data will soon start being collected.

    Any non-idempotent initialization should be done here."""

    pass

  def start_file(self, cvs_file_items):
    """Prepare to receive data for the file with the given CVS_FILE_ITEMS.

    CVS_FILE_ITEMS is an instance of CVSFileItems describing the file
    dependency topology right after the file tree was parsed out of
    the RCS file (i.e., it reflects the original CVS dependency
    structure).  Note that the CVSFileItems instance will be changed
    later."""

    pass

  def record_text(self, cvs_rev, log, text):
    """Record information about a revision and optionally return a token.

    CVS_REV is a CVSRevision instance describing a revision that has
    log message LOG and text TEXT (as retrieved from the RCS file).
    (TEXT is full text for the HEAD revision, and deltas for other
    revisions.)"""

    raise NotImplementedError()

  def finish_file(self, cvs_file_items):
    """The current file is finished; finish and clean up.

    CVS_FILE_ITEMS is a CVSFileItems instance describing the file's
    items at the end of processing of the RCS file in CollectRevsPass.
    It may be modified relative to the CVS_FILE_ITEMS instance passed
    to the corresponding start_file() call (revisions might be
    deleted, topology changed, etc)."""

    pass

  def finish(self):
    """All recording is done; clean up."""

    pass


class NullRevisionRecorder(RevisionRecorder):
  """A do-nothing variety of RevisionRecorder."""

  def record_text(self, cvs_rev, log, text):
    return None


class RevisionExcluder:
  """An interface for informing a RevisionReader about excluded revisions.

  Currently, revisions can be excluded via the --exclude option and
  various fixups for CVS peculiarities.  This interface can be used to
  inform the associated RevisionReader about CVSItems that are being
  excluded.  (The recorder might use that information to free some
  temporary data or adjust its expectations about which revisions will
  later be read.)"""

  def __init__(self):
    """Initialize the RevisionExcluder.

    A RevisionExcluder is instantiated in every program run, even if
    the branch-exclusion pass will not be executed (this allows its
    register_artifacts() method to be called).  Therefore this method
    should not do much; more substantial preparation for use (like
    actually creating the artifacts) should be done in start()."""

    pass

  def register_artifacts(self, which_pass):
    """Register artifacts that will be needed during branch exclusion.

    WHICH_PASS is the pass that will call our callbacks, so it should
    be used to do the registering (e.g., call
    WHICH_PASS.register_temp_file() and/or
    WHICH_PASS.register_temp_file_needed())."""

    pass

  def start(self):
    """Prepare to handle branch exclusions."""

    pass

  def process_file(self, cvs_file_items):
    """Called for files whose trees were modified in FilterSymbolsPass.

    This callback is called once for each CVSFile whose topology was
    modified in FilterSymbolsPass."""

    raise NotImplementedError()

  def finish(self):
    """Called after all branch exclusions for all files are done."""

    pass


class NullRevisionExcluder(RevisionExcluder):
  """A do-nothing variety of RevisionExcluder."""

  def process_file(self, cvs_file_items):
    pass


class RevisionReader(object):
  """An object that can read the contents of CVSRevisions."""

  def register_artifacts(self, which_pass):
    """Register artifacts that will be needed while reading revisions.

    WHICH_PASS is the pass that will call our callbacks, so it should
    be used to do the registering (e.g., call
    WHICH_PASS.register_temp_file() and/or
    WHICH_PASS.register_temp_file_needed())."""

    pass

  def start(self):
    """Prepare for calls to get_content_stream()."""

    pass

  def get_content_stream(self, cvs_rev, suppress_keyword_substitution=False):
    """Return a file-like object from which the contents of CVS_REV
    can be read.

    CVS_REV is a CVSRevision.  If SUPPRESS_KEYWORD_SUBSTITUTION is
    True, then suppress the substitution of RCS/CVS keywords in the
    output."""

    raise NotImplementedError

  def finish(self):
    """Inform the reader that all calls to get_content_stream are done.
    Start may be called again at a later point."""

    pass
+# +# ==================================================================== +# Copyright (c) 2000-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains classes to set common cvs2xxx run options.""" + +import sys +import re +import optparse +from optparse import OptionGroup +import time + +from cvs2svn_lib.version import VERSION +from cvs2svn_lib import config +from cvs2svn_lib.common import warning_prefix +from cvs2svn_lib.common import error_prefix +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import CVSTextDecoder +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.man_writer import ManOption +from cvs2svn_lib.pass_manager import InvalidPassError +from cvs2svn_lib.symbol_strategy import AllBranchRule +from cvs2svn_lib.symbol_strategy import AllTagRule +from cvs2svn_lib.symbol_strategy import BranchIfCommitsRule +from cvs2svn_lib.symbol_strategy import ExcludeRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ForceBranchRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ForceTagRegexpStrategyRule +from cvs2svn_lib.symbol_strategy import ExcludeTrivialImportBranchRule +from cvs2svn_lib.symbol_strategy import HeuristicStrategyRule +from cvs2svn_lib.symbol_strategy import UnambiguousUsageRule +from cvs2svn_lib.symbol_strategy import HeuristicPreferredParentRule +from cvs2svn_lib.symbol_strategy import 
SymbolHintsFileRule +from cvs2svn_lib.symbol_transform import ReplaceSubstringsSymbolTransform +from cvs2svn_lib.symbol_transform import RegexpSymbolTransform +from cvs2svn_lib.symbol_transform import NormalizePathsSymbolTransform +from cvs2svn_lib.property_setters import AutoPropsPropertySetter +from cvs2svn_lib.property_setters import CVSBinaryFileDefaultMimeTypeSetter +from cvs2svn_lib.property_setters import CVSBinaryFileEOLStyleSetter +from cvs2svn_lib.property_setters import CVSRevisionNumberSetter +from cvs2svn_lib.property_setters import DefaultEOLStyleSetter +from cvs2svn_lib.property_setters import EOLStyleFromMimeTypeSetter +from cvs2svn_lib.property_setters import ExecutablePropertySetter +from cvs2svn_lib.property_setters import KeywordsPropertySetter +from cvs2svn_lib.property_setters import MimeMapper +from cvs2svn_lib.property_setters import SVNBinaryFileKeywordsPropertySetter + + +usage = """\ +Usage: %prog --options OPTIONFILE + %prog [OPTION...] OUTPUT-OPTION CVS-REPOS-PATH""" + +description="""\ +Convert a CVS repository into a Subversion repository, including history. +""" + +authors = u"""\ +Main authors are: +.br +C. Michael Pilato +.br +Greg Stein +.br +Branko \u010cibej +.br +Blair Zajac +.br +Max Bowsher +.br +Brian Fitzpatrick +.br +Tobias Ringstr\u00f6m +.br +Karl Fogel +.br +Erik H\u00fclsmann +.br +David Summers +.br +Michael Haggerty +.PP +Manpage was written for the Debian GNU/Linux system by +Laszlo 'GCS' Boszormenyi (but may be used by others). +""" + + +class IncompatibleOption(ManOption): + """A ManOption that is incompatible with the --options option. 
+ + Record that the option was used so that error checking can later be + done.""" + + def __init__(self, *args, **kw): + ManOption.__init__(self, *args, **kw) + + def take_action(self, action, dest, opt, value, values, parser): + oio = parser.values.options_incompatible_options + if opt not in oio: + oio.append(opt) + return ManOption.take_action( + self, action, dest, opt, value, values, parser + ) + + +class ContextOption(ManOption): + """A ManOption that stores its value to Ctx.""" + + def __init__(self, *args, **kw): + if kw.get('action') not in self.STORE_ACTIONS: + raise ValueError('Invalid action: %s' % (kw['action'],)) + + self.__compatible_with_option = kw.pop('compatible_with_option', False) + self.__action = kw.pop('action') + try: + self.__dest = kw.pop('dest') + except KeyError: + opt = args[0] + if not opt.startswith('--'): + raise ValueError + self.__dest = opt[2:].replace('-', '_') + if 'const' in kw: + self.__const = kw.pop('const') + + kw['action'] = 'callback' + kw['callback'] = self.__callback + + ManOption.__init__(self, *args, **kw) + + def __callback(self, option, opt_str, value, parser): + if not self.__compatible_with_option: + oio = parser.values.options_incompatible_options + if opt_str not in oio: + oio.append(opt_str) + + action = self.__action + dest = self.__dest + + if action == "store": + setattr(Ctx(), dest, value) + elif action == "store_const": + setattr(Ctx(), dest, self.__const) + elif action == "store_true": + setattr(Ctx(), dest, True) + elif action == "store_false": + setattr(Ctx(), dest, False) + elif action == "append": + getattr(Ctx(), dest).append(value) + elif action == "count": + setattr(Ctx(), dest, getattr(Ctx(), dest, 0) + 1) + else: + raise RuntimeError("unknown action %r" % self.__action) + + return 1 + + +class IncompatibleOptionsException(FatalError): + pass + + +# Options that are not allowed to be used with --trunk-only: +SYMBOL_OPTIONS = [ + '--symbol-transform', + '--symbol-hints', + '--force-branch', + 
'--force-tag', + '--exclude', + '--keep-trivial-imports', + '--symbol-default', + '--no-cross-branch-commits', + ] + +class SymbolOptionsWithTrunkOnlyException(IncompatibleOptionsException): + def __init__(self): + IncompatibleOptionsException.__init__( + self, + 'The following symbol-related options cannot be used together\n' + 'with --trunk-only:\n' + ' %s' + % ('\n '.join(SYMBOL_OPTIONS),) + ) + + +def not_both(opt1val, opt1name, opt2val, opt2name): + """Raise an exception if both opt1val and opt2val are set.""" + if opt1val and opt2val: + raise IncompatibleOptionsException( + "cannot pass both '%s' and '%s'." % (opt1name, opt2name,) + ) + + +class RunOptions(object): + """A place to store meta-options that are used to start the conversion.""" + + def __init__(self, progname, cmd_args, pass_manager): + """Process the command-line options, storing run options to SELF. + + PROGNAME is the name of the program, used in the usage string. + CMD_ARGS is the list of command-line arguments passed to the + program. 
PASS_MANAGER is an instance of PassManager, needed to + help process the -p and --help-passes options.""" + + self.progname = progname + self.cmd_args = cmd_args + self.pass_manager = pass_manager + self.start_pass = 1 + self.end_pass = self.pass_manager.num_passes + self.profiling = False + + self.projects = [] + + # A list of one list of SymbolStrategyRules for each project: + self.project_symbol_strategy_rules = [] + + parser = self.parser = optparse.OptionParser( + usage=usage, + description=self.get_description(), + add_help_option=False, + ) + # A place to record any options used that are incompatible with + # --options: + parser.set_default('options_incompatible_options', []) + + # Populate the options parser with the options, one group at a + # time: + parser.add_option_group(self._get_options_file_options_group()) + parser.add_option_group(self._get_output_options_group()) + parser.add_option_group(self._get_conversion_options_group()) + parser.add_option_group(self._get_symbol_handling_options_group()) + parser.add_option_group(self._get_subversion_properties_options_group()) + parser.add_option_group(self._get_extraction_options_group()) + parser.add_option_group(self._get_environment_options_group()) + parser.add_option_group(self._get_partial_conversion_options_group()) + parser.add_option_group(self._get_information_options_group()) + + (self.options, self.args) = parser.parse_args(args=self.cmd_args) + + # Now the log level has been set; log the time when the run started: + Log().verbose( + time.strftime( + 'Conversion start time: %Y-%m-%d %I:%M:%S %Z', + time.localtime(Log().start_time) + ) + ) + + if self.options.options_file_found: + # Check that no options that are incompatible with --options + # were used: + self.verify_option_compatibility() + else: + # --options was not specified. 
So do the main initialization + # based on other command-line options: + self.process_options() + + # Check for problems with the options: + self.check_options() + + def get_description(self): + return description + + def _get_options_file_options_group(self): + group = OptionGroup( + self.parser, 'Configuration via options file' + ) + self.parser.set_default('options_file_found', False) + group.add_option(ManOption( + '--options', type='string', + action='callback', callback=self.callback_options, + help=( + 'read the conversion options from PATH. This ' + 'method allows more flexibility than using ' + 'command-line options. See documentation for info' + ), + man_help=( + 'Read the conversion options from \\fIpath\\fR instead of from ' + 'the command line. This option allows far more conversion ' + 'flexibility than can be achieved using the command-line alone. ' + 'See the documentation for more information. Only the following ' + 'command-line options are allowed in combination with ' + '\\fB--options\\fR: \\fB-h\\fR/\\fB--help\\fR, ' + '\\fB--help-passes\\fR, \\fB--version\\fR, ' + '\\fB-v\\fR/\\fB--verbose\\fR, \\fB-q\\fR/\\fB--quiet\\fR, ' + '\\fB-p\\fR/\\fB--pass\\fR/\\fB--passes\\fR, \\fB--dry-run\\fR, ' + '\\fB--profile\\fR, \\fB--sort\\fR, \\fB--trunk-only\\fR, ' + '\\fB--encoding\\fR, and \\fB--fallback-encoding\\fR. ' + 'Options are processed in the order specified on the command ' + 'line.' + ), + metavar='PATH', + )) + return group + + def _get_output_options_group(self): + group = OptionGroup(self.parser, 'Output options') + return group + + def _get_conversion_options_group(self): + group = OptionGroup(self.parser, 'Conversion options') + group.add_option(ContextOption( + '--trunk-only', + action='store_true', + compatible_with_option=True, + help='convert only trunk commits, not tags nor branches', + man_help=( + 'Convert only trunk commits, not tags nor branches.' 
+            ),
+        ))
+    group.add_option(ManOption(
+        '--encoding', type='string',
+        action='callback', callback=self.callback_encoding,
+        help=(
+            'encoding for paths and log messages in CVS repos. '
+            'If option is specified multiple times, encoders '
+            'are tried in order until one succeeds. See '
+            'http://docs.python.org/lib/standard-encodings.html '
+            'for a list of standard Python encodings.'
+            ),
+        man_help=(
+            'Use \\fIencoding\\fR as the encoding for filenames, log '
+            'messages, and author names in the CVS repos. This option '
+            'may be specified multiple times, in which case the encodings '
+            'are tried in order until one succeeds. Default: ascii. See '
+            'http://docs.python.org/lib/standard-encodings.html for a list '
+            'of other standard encodings.'
+            ),
+        metavar='ENC',
+        ))
+    group.add_option(ManOption(
+        '--fallback-encoding', type='string',
+        action='callback', callback=self.callback_fallback_encoding,
+        help='If all --encodings fail, use lossy encoding with ENC',
+        man_help=(
+            'If none of the encodings specified with \\fB--encoding\\fR '
+            'succeed in decoding an author name or log message, then fall '
+            'back to using \\fIencoding\\fR in lossy \'replace\' mode. '
+            'Use of this option may cause information to be lost, but at '
+            'least it allows the conversion to run to completion. This '
+            'option only affects the encoding of log messages and author '
+            'names; there is no fallback encoding for filenames. (By '
+            'using an \\fB--options\\fR file, it is possible to specify '
+            'a fallback encoding for filenames.) Default: disabled.'
+            ),
+        metavar='ENC',
+        ))
+    group.add_option(ContextOption(
+        '--retain-conflicting-attic-files',
+        action='store_true',
+        help=(
+            'if a file appears both in and out of '
+            'the CVS Attic, then leave the attic version in a '
+            'SVN directory called "Attic"'
+            ),
+        man_help=(
+            'If a file appears both inside and outside of the CVS attic, '
+            'retain the attic version in an SVN subdirectory called '
+            '\'Attic\'. 
(Normally this situation is treated as a fatal '
+            'error.)'
+            ),
+        ))
+
+    return group
+
+  def _get_symbol_handling_options_group(self):
+    group = OptionGroup(self.parser, 'Symbol handling')
+    self.parser.set_default('symbol_transforms', [])
+    group.add_option(IncompatibleOption(
+        '--symbol-transform', type='string',
+        action='callback', callback=self.callback_symbol_transform,
+        help=(
+            'transform symbol names from P to S, where P and S '
+            'use Python regexp and reference syntax '
+            'respectively. P must match the whole symbol name'
+            ),
+        man_help=(
+            'Transform RCS/CVS symbol names before entering them into '
+            'Subversion. \\fIpattern\\fR is a Python regexp pattern that '
+            'is matched against the entire symbol name; \\fIreplacement\\fR '
+            'is a replacement using Python\'s regexp reference syntax. '
+            'You may specify any number of these options; they will be '
+            'applied in the order given on the command line.'
+            ),
+        metavar='P:S',
+        ))
+    self.parser.set_default('symbol_strategy_rules', [])
+    group.add_option(IncompatibleOption(
+        '--symbol-hints', type='string',
+        action='callback', callback=self.callback_symbol_hints,
+        help='read symbol conversion hints from PATH',
+        man_help=(
+            'Read symbol conversion hints from \\fIpath\\fR. The format of '
+            '\\fIpath\\fR is the same as the format output by '
+            '\\fB--write-symbol-info\\fR, namely a text file with four '
+            'whitespace-separated columns: \\fIproject-id\\fR, '
+            '\\fIsymbol\\fR, \\fIconversion\\fR, and '
+            '\\fIparent-lod-name\\fR. \\fIproject-id\\fR is the numerical '
+            'ID of the project to which the symbol belongs, counting from '
+            '0. \\fIproject-id\\fR can be set to \'.\' if '
+            'project-specificity is not needed. \\fIsymbol-name\\fR is the '
+            'name of the symbol being specified. \\fIconversion\\fR '
+            'specifies how the symbol should be converted, and can be one '
+            'of the values \'branch\', \'tag\', or \'exclude\'. 
If ' + '\\fIconversion\\fR is \'.\', then this rule does not affect ' + 'how the symbol is converted. \\fIparent-lod-name\\fR is the ' + 'name of the symbol from which this symbol should sprout, or ' + '\'.trunk.\' if the symbol should sprout from trunk. If ' + '\\fIparent-lod-name\\fR is omitted or \'.\', then this rule ' + 'does not affect the preferred parent of this symbol. The file ' + 'may contain blank lines or comment lines (lines whose first ' + 'non-whitespace character is \'#\').' + ), + metavar='PATH', + )) + self.parser.set_default('symbol_default', 'heuristic') + group.add_option(IncompatibleOption( + '--symbol-default', type='choice', + choices=['heuristic', 'strict', 'branch', 'tag'], + action='store', + help=( + 'specify how ambiguous symbols are converted. ' + 'OPT is "heuristic" (default), "strict", "branch", ' + 'or "tag"' + ), + man_help=( + 'Specify how to convert ambiguous symbols (those that appear in ' + 'the CVS archive as both branches and tags). \\fIopt\\fR must ' + 'be \'heuristic\' (decide how to treat each ambiguous symbol ' + 'based on whether it was used more often as a branch/tag in ' + 'CVS), \'strict\' (no default; every ambiguous symbol has to be ' + 'resolved manually using \\fB--force-branch\\fR, ' + '\\fB--force-tag\\fR, or \\fB--exclude\\fR), \'branch\' (treat ' + 'every ambiguous symbol as a branch), or \'tag\' (treat every ' + 'ambiguous symbol as a tag). The default is \'heuristic\'.' + ), + metavar='OPT', + )) + group.add_option(IncompatibleOption( + '--force-branch', type='string', + action='callback', callback=self.callback_force_branch, + help='force symbols matching REGEXP to be branches', + man_help=( + 'Force symbols whose names match \\fIregexp\\fR to be branches. ' + '\\fIregexp\\fR must match the whole symbol name.' 
+ ), + metavar='REGEXP', + )) + group.add_option(IncompatibleOption( + '--force-tag', type='string', + action='callback', callback=self.callback_force_tag, + help='force symbols matching REGEXP to be tags', + man_help=( + 'Force symbols whose names match \\fIregexp\\fR to be tags. ' + '\\fIregexp\\fR must match the whole symbol name.' + ), + metavar='REGEXP', + )) + group.add_option(IncompatibleOption( + '--exclude', type='string', + action='callback', callback=self.callback_exclude, + help='exclude branches and tags matching REGEXP', + man_help=( + 'Exclude branches and tags whose names match \\fIregexp\\fR ' + 'from the conversion. \\fIregexp\\fR must match the whole ' + 'symbol name.' + ), + metavar='REGEXP', + )) + self.parser.set_default('keep_trivial_imports', False) + group.add_option(IncompatibleOption( + '--keep-trivial-imports', + action='store_true', + help=( + 'do not exclude branches that were only used for ' + 'a single import (usually these are unneeded)' + ), + man_help=( + 'Do not exclude branches that were only used for a single ' + 'import. (By default such branches are excluded because they ' + 'are usually created by the inappropriate use of \\fBcvs ' + 'import\\fR.)' + ), + )) + + return group + + def _get_subversion_properties_options_group(self): + group = OptionGroup(self.parser, 'Subversion properties') + group.add_option(ContextOption( + '--username', type='string', + action='store', + help='username for cvs2svn-synthesized commits', + man_help=( + 'Set the default username to \\fIname\\fR when cvs2svn needs ' + 'to generate a commit for which CVS does not record the ' + 'original username. This happens when a branch or tag is ' + 'created. The default is to use no author at all for such ' + 'commits.' 
+ ), + metavar='NAME', + )) + self.parser.set_default('auto_props_files', []) + group.add_option(IncompatibleOption( + '--auto-props', type='string', + action='append', dest='auto_props_files', + help=( + 'set file properties from the auto-props section ' + 'of a file in svn config format' + ), + man_help=( + 'Specify a file in the format of Subversion\'s config file, ' + 'whose [auto-props] section can be used to set arbitrary ' + 'properties on files in the Subversion repository based on ' + 'their filenames. (The [auto-props] section header must be ' + 'present; other sections of the config file, including the ' + 'enable-auto-props setting, are ignored.) Filenames are matched ' + 'to the filename patterns case-insensitively.' + + ), + metavar='FILE', + )) + self.parser.set_default('mime_types_files', []) + group.add_option(IncompatibleOption( + '--mime-types', type='string', + action='append', dest='mime_types_files', + help=( + 'specify an apache-style mime.types file for setting ' + 'svn:mime-type' + ), + man_help=( + 'Specify an apache-style mime.types \\fIfile\\fR for setting ' + 'svn:mime-type.' + ), + metavar='FILE', + )) + self.parser.set_default('eol_from_mime_type', False) + group.add_option(IncompatibleOption( + '--eol-from-mime-type', + action='store_true', + help='set svn:eol-style from mime type if known', + man_help=( + 'For files that don\'t have the kb expansion mode but have a ' + 'known mime type, set the eol-style based on the mime type. ' + 'For such files, set svn:eol-style to "native" if the mime type ' + 'begins with "text/", and leave it unset (i.e., no EOL ' + 'translation) otherwise. Files with unknown mime types are ' + 'not affected by this option. This option has no effect ' + 'unless the \\fB--mime-types\\fR option is also specified.' 
+ ), + )) + group.add_option(IncompatibleOption( + '--default-eol', type='choice', + choices=['binary', 'native', 'CRLF', 'LF', 'CR'], + action='store', + help=( + 'default svn:eol-style for non-binary files with ' + 'undetermined mime types. STYLE is "binary" ' + '(default), "native", "CRLF", "LF", or "CR"' + ), + man_help=( + 'Set svn:eol-style to \\fIstyle\\fR for files that don\'t have ' + 'the CVS \'kb\' expansion mode and whose end-of-line ' + 'translation mode hasn\'t been determined by one of the other ' + 'options. \\fIstyle\\fR must be \'binary\' (default), ' + '\'native\', \'CRLF\', \'LF\', or \'CR\'.' + ), + metavar='STYLE', + )) + self.parser.set_default('keywords_off', False) + group.add_option(IncompatibleOption( + '--keywords-off', + action='store_true', + help=( + 'don\'t set svn:keywords on any files (by default, ' + 'cvs2svn sets svn:keywords on non-binary files to "%s")' + % (config.SVN_KEYWORDS_VALUE,) + ), + man_help=( + 'By default, cvs2svn sets svn:keywords on CVS files to "author ' + 'id date" if the mode of the RCS file in question is either kv, ' + 'kvl or unset. If you use the --keywords-off switch, cvs2svn ' + 'will not set svn:keywords for any file. While this will not ' + 'touch the keywords in the contents of your files, Subversion ' + 'will not expand them.' + ), + )) + group.add_option(ContextOption( + '--keep-cvsignore', + action='store_true', + help=( + 'keep .cvsignore files (in addition to creating ' + 'the analogous svn:ignore properties)' + ), + man_help=( + 'Include \\fI.cvsignore\\fR files in the output. (Normally ' + 'they are unneeded because cvs2svn sets the corresponding ' + '\\fIsvn:ignore\\fR properties.)' + ), + )) + group.add_option(IncompatibleOption( + '--cvs-revnums', + action='callback', callback=self.callback_cvs_revnums, + help='record CVS revision numbers as file properties', + man_help=( + 'Record CVS revision numbers as file properties in the ' + 'Subversion repository. 
(Note that unless it is removed ' + 'explicitly, the last CVS revision number will remain ' + 'associated with the file even after the file is changed ' + 'within Subversion.)' + ), + )) + + # Deprecated options: + group.add_option(IncompatibleOption( + '--no-default-eol', + action='store_const', dest='default_eol', const=None, + help=optparse.SUPPRESS_HELP, + man_help=optparse.SUPPRESS_HELP, + )) + self.parser.set_default('auto_props_ignore_case', True) + # True is the default now, so this option has no effect: + group.add_option(IncompatibleOption( + '--auto-props-ignore-case', + action='store_true', + help=optparse.SUPPRESS_HELP, + man_help=optparse.SUPPRESS_HELP, + )) + + return group + + def _get_extraction_options_group(self): + group = OptionGroup(self.parser, 'Extraction options') + + return group + + def _get_environment_options_group(self): + group = OptionGroup(self.parser, 'Environment options') + group.add_option(ContextOption( + '--tmpdir', type='string', + action='store', + help=( + 'directory to use for temporary data files ' + '(default "cvs2svn-tmp")' + ), + man_help=( + 'Set the \\fIpath\\fR to use for temporary data. Default ' + 'is a directory called \\fIcvs2svn-tmp\\fR under the current ' + 'directory.' + ), + metavar='PATH', + )) + self.parser.set_default('co_executable', config.CO_EXECUTABLE) + group.add_option(IncompatibleOption( + '--co', type='string', + action='store', dest='co_executable', + help='path to the "co" program (required if --use-rcs)', + man_help=( + 'Path to the \\fIco\\fR program. (\\fIco\\fR is needed if the ' + '\\fB--use-rcs\\fR option is used.)' + ), + metavar='PATH', + )) + self.parser.set_default('cvs_executable', config.CVS_EXECUTABLE) + group.add_option(IncompatibleOption( + '--cvs', type='string', + action='store', dest='cvs_executable', + help='path to the "cvs" program (required if --use-cvs)', + man_help=( + 'Path to the \\fIcvs\\fR program. 
(\\fIcvs\\fR is needed if the ' + '\\fB--use-cvs\\fR option is used.)' + ), + metavar='PATH', + )) + group.add_option(ContextOption( + '--sort', type='string', + action='store', dest='sort_executable', + compatible_with_option=True, + help='path to the GNU "sort" program', + man_help=( + 'Path to the GNU \\fIsort\\fR program. (cvs2svn requires GNU ' + 'sort.)' + ), + metavar='PATH', + )) + + return group + + def _get_partial_conversion_options_group(self): + group = OptionGroup(self.parser, 'Partial conversions') + group.add_option(ManOption( + '--pass', type='string', + action='callback', callback=self.callback_passes, + help='execute only specified PASS of conversion', + man_help=( + 'Execute only pass \\fIpass\\fR of the conversion. ' + '\\fIpass\\fR can be specified by name or by number (see ' + '\\fB--help-passes\\fR).' + ), + metavar='PASS', + )) + group.add_option(ManOption( + '--passes', '-p', type='string', + action='callback', callback=self.callback_passes, + help=( + 'execute passes START through END, inclusive (PASS, ' + 'START, and END can be pass names or numbers)' + ), + man_help=( + 'Execute passes \\fIstart\\fR through \\fIend\\fR of the ' + 'conversion (inclusive). \\fIstart\\fR and \\fIend\\fR can be ' + 'specified by name or by number (see \\fB--help-passes\\fR). ' + 'If \\fIstart\\fR or \\fIend\\fR is missing, it defaults to ' + 'the first or last pass, respectively. For this to work the ' + 'earlier passes must have been completed before on the ' + 'same CVS repository, and the generated data files must be ' + 'in the temporary directory (see \\fB--tmpdir\\fR).' 
+ ), + metavar='[START]:[END]', + )) + + return group + + def _get_information_options_group(self): + group = OptionGroup(self.parser, 'Information options') + group.add_option(ManOption( + '--version', + action='callback', callback=self.callback_version, + help='print the version number', + man_help='Print the version number.', + )) + group.add_option(ManOption( + '--help', '-h', + action="help", + help='print this usage message and exit with success', + man_help='Print the usage message and exit with success.', + )) + group.add_option(ManOption( + '--help-passes', + action='callback', callback=self.callback_help_passes, + help='list the available passes and their numbers', + man_help=( + 'Print the numbers and names of the conversion passes and ' + 'exit with success.' + ), + )) + group.add_option(ManOption( + '--man', + action='callback', callback=self.callback_manpage, + help='write the manpage for this program to standard output', + man_help=( + 'Output the unix-style manpage for this program to standard ' + 'output.' + ), + )) + group.add_option(ManOption( + '--verbose', '-v', + action='callback', callback=self.callback_verbose, + help='verbose (may be specified twice for debug output)', + man_help=( + 'Print more information while running. This option may be ' + 'specified twice to output voluminous debugging information.' + ), + )) + group.add_option(ManOption( + '--quiet', '-q', + action='callback', callback=self.callback_quiet, + help='quiet (may be specified twice for very quiet)', + man_help=( + 'Print less information while running. This option may be ' + 'specified twice to suppress all non-error output.' + ), + )) + group.add_option(ContextOption( + '--write-symbol-info', type='string', + action='store', dest='symbol_info_filename', + help='write information and statistics about CVS symbols to PATH.', + man_help=( + 'Write to \\fIpath\\fR symbol statistics and information about ' + 'how symbols were converted during CollateSymbolsPass.' 
+ ), + metavar='PATH', + )) + group.add_option(ContextOption( + '--skip-cleanup', + action='store_true', + help='prevent the deletion of intermediate files', + man_help='Prevent the deletion of temporary files.', + )) + group.add_option(ManOption( + '--profile', + action='callback', callback=self.callback_profile, + help='profile with \'hotshot\' (into file cvs2svn.hotshot)', + man_help=( + 'Profile with \'hotshot\' (into file \\fIcvs2svn.hotshot\\fR).' + ), + )) + + return group + + def callback_options(self, option, opt_str, value, parser): + parser.values.options_file_found = True + self.process_options_file(value) + + def callback_encoding(self, option, opt_str, value, parser): + ctx = Ctx() + + try: + ctx.cvs_author_decoder.add_encoding(value) + ctx.cvs_log_decoder.add_encoding(value) + ctx.cvs_filename_decoder.add_encoding(value) + except LookupError, e: + raise FatalError(str(e)) + + def callback_fallback_encoding(self, option, opt_str, value, parser): + ctx = Ctx() + + try: + ctx.cvs_author_decoder.set_fallback_encoding(value) + ctx.cvs_log_decoder.set_fallback_encoding(value) + # Don't use fallback_encoding for filenames. 
+ except LookupError, e: + raise FatalError(str(e)) + + def callback_help_passes(self, option, opt_str, value, parser): + self.pass_manager.help_passes() + sys.exit(0) + + def callback_manpage(self, option, opt_str, value, parser): + raise NotImplementedError() + + def callback_version(self, option, opt_str, value, parser): + sys.stdout.write( + '%s version %s\n' % (self.progname, VERSION) + ) + sys.exit(0) + + def callback_verbose(self, option, opt_str, value, parser): + Log().increase_verbosity() + + def callback_quiet(self, option, opt_str, value, parser): + Log().decrease_verbosity() + + def callback_passes(self, option, opt_str, value, parser): + if value.find(':') >= 0: + start_pass, end_pass = value.split(':') + self.start_pass = self.pass_manager.get_pass_number(start_pass, 1) + self.end_pass = self.pass_manager.get_pass_number( + end_pass, self.pass_manager.num_passes + ) + else: + self.end_pass = \ + self.start_pass = \ + self.pass_manager.get_pass_number(value) + + def callback_profile(self, option, opt_str, value, parser): + self.profiling = True + + def callback_symbol_hints(self, option, opt_str, value, parser): + parser.values.symbol_strategy_rules.append(SymbolHintsFileRule(value)) + + def callback_force_branch(self, option, opt_str, value, parser): + parser.values.symbol_strategy_rules.append( + ForceBranchRegexpStrategyRule(value) + ) + + def callback_force_tag(self, option, opt_str, value, parser): + parser.values.symbol_strategy_rules.append( + ForceTagRegexpStrategyRule(value) + ) + + def callback_exclude(self, option, opt_str, value, parser): + parser.values.symbol_strategy_rules.append( + ExcludeRegexpStrategyRule(value) + ) + + def callback_cvs_revnums(self, option, opt_str, value, parser): + Ctx().svn_property_setters.append(CVSRevisionNumberSetter()) + + def callback_symbol_transform(self, option, opt_str, value, parser): + [pattern, replacement] = value.split(":") + try: + parser.values.symbol_transforms.append( + 
RegexpSymbolTransform(pattern, replacement) + ) + except re.error: + raise FatalError("'%s' is not a valid regexp." % (pattern,)) + + def process_symbol_strategy_options(self): + """Process symbol strategy-related options.""" + + ctx = Ctx() + options = self.options + + # Add the standard symbol name cleanup rules: + self.options.symbol_transforms.extend([ + ReplaceSubstringsSymbolTransform('\\','/'), + # Remove leading, trailing, and repeated slashes: + NormalizePathsSymbolTransform(), + ]) + + if ctx.trunk_only: + if options.symbol_strategy_rules or options.keep_trivial_imports: + raise SymbolOptionsWithTrunkOnlyException() + + else: + if not options.keep_trivial_imports: + options.symbol_strategy_rules.append(ExcludeTrivialImportBranchRule()) + + options.symbol_strategy_rules.append(UnambiguousUsageRule()) + if options.symbol_default == 'strict': + pass + elif options.symbol_default == 'branch': + options.symbol_strategy_rules.append(AllBranchRule()) + elif options.symbol_default == 'tag': + options.symbol_strategy_rules.append(AllTagRule()) + elif options.symbol_default == 'heuristic': + options.symbol_strategy_rules.append(BranchIfCommitsRule()) + options.symbol_strategy_rules.append(HeuristicStrategyRule()) + else: + assert False + + # Now add a rule whose job it is to pick the preferred parents of + # branches and tags: + options.symbol_strategy_rules.append(HeuristicPreferredParentRule()) + + def process_property_setter_options(self): + """Process the options that set SVN properties.""" + + ctx = Ctx() + options = self.options + + for value in options.auto_props_files: + ctx.svn_property_setters.append( + AutoPropsPropertySetter(value, options.auto_props_ignore_case) + ) + + for value in options.mime_types_files: + ctx.svn_property_setters.append(MimeMapper(value)) + + ctx.svn_property_setters.append(CVSBinaryFileEOLStyleSetter()) + + ctx.svn_property_setters.append(CVSBinaryFileDefaultMimeTypeSetter()) + + if options.eol_from_mime_type: + 
ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter()) + + ctx.svn_property_setters.append( + DefaultEOLStyleSetter(options.default_eol) + ) + + ctx.svn_property_setters.append(SVNBinaryFileKeywordsPropertySetter()) + + if not options.keywords_off: + ctx.svn_property_setters.append( + KeywordsPropertySetter(config.SVN_KEYWORDS_VALUE)) + + ctx.svn_property_setters.append(ExecutablePropertySetter()) + + def process_options(self): + """Do the main configuration based on command-line options. + + This method is only called if the --options option was not + specified.""" + + raise NotImplementedError() + + def check_options(self): + """Check the the run options are OK. + + This should only be called after all options have been processed.""" + + # Convenience var, so we don't have to keep instantiating this Borg. + ctx = Ctx() + + if not self.start_pass <= self.end_pass: + raise InvalidPassError( + 'Ending pass must not come before starting pass.') + + if not ctx.dry_run and ctx.output_option is None: + raise FatalError('No output option specified.') + + if ctx.output_option is not None: + ctx.output_option.check() + + if not self.projects: + raise FatalError('No project specified.') + + def verify_option_compatibility(self): + """Verify that no options incompatible with --options were used. + + The --options option was specified. 
Verify that no incompatible + options or arguments were specified.""" + + if self.options.options_incompatible_options or self.args: + if self.options.options_incompatible_options: + oio = self.options.options_incompatible_options + Log().error( + '%s: The following options cannot be used in combination with ' + 'the --options\n' + 'option:\n' + ' %s\n' + % (error_prefix, '\n '.join(oio)) + ) + if self.args: + Log().error( + '%s: No cvs-repos-path arguments are allowed with the --options ' + 'option.\n' + % (error_prefix,) + ) + sys.exit(1) + + def process_options_file(self, options_filename): + """Read options from the file named OPTIONS_FILENAME. + + Store the run options to SELF.""" + + g = { + 'ctx' : Ctx(), + 'run_options' : self, + } + execfile(options_filename, g) + + def usage(self): + self.parser.print_help() + + diff --git a/cvs2svn_lib/serializer.py b/cvs2svn_lib/serializer.py new file mode 100644 index 0000000..24bd81c --- /dev/null +++ b/cvs2svn_lib/serializer.py @@ -0,0 +1,146 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""Picklers and unpicklers that are primed with known objects.""" + + +import cStringIO +import marshal +import cPickle +import zlib + + +class Serializer: + """An object able to serialize/deserialize some class of objects.""" + + def dumpf(self, f, object): + """Serialize OBJECT to file-like object F.""" + + raise NotImplementedError() + + def dumps(self, object): + """Return a string containing OBJECT in serialized form.""" + + raise NotImplementedError() + + def loadf(self, f): + """Return the next object deserialized from file-like object F.""" + + raise NotImplementedError() + + def loads(self, s): + """Return the object deserialized from string S.""" + + raise NotImplementedError() + + +class MarshalSerializer(Serializer): + """This class uses the marshal module to serialize/deserialize. + + This means that it shares the limitations of the marshal module, + namely only being able to serialize a few simple python data types + without reference loops.""" + + def dumpf(self, f, object): + marshal.dump(object, f) + + def dumps(self, object): + return marshal.dumps(object) + + def loadf(self, f): + return marshal.load(f) + + def loads(self, s): + return marshal.loads(s) + + +class PrimedPickleSerializer(Serializer): + """This class acts as a pickler/unpickler with a pre-initialized memo. + + The picklers and unpicklers are 'pre-trained' to recognize the + objects that are in the primer. If objects are recognized + from PRIMER, then only their persistent IDs need to be pickled + instead of the whole object. (Note that the memos needed for + pickling and unpickling are different.) + + A new pickler/unpickler is created for each use, each time with the + memo initialized appropriately for pickling or unpickling.""" + + def __init__(self, primer): + """Prepare to make picklers/unpicklers with the specified primer. 
+ + The Pickler and Unpickler are 'primed' by pre-pickling PRIMER, + which can be an arbitrary object (e.g., a list of objects that are + expected to occur frequently in the objects to be serialized).""" + + f = cStringIO.StringIO() + pickler = cPickle.Pickler(f, -1) + pickler.dump(primer) + self.pickler_memo = pickler.memo + + unpickler = cPickle.Unpickler(cStringIO.StringIO(f.getvalue())) + unpickler.load() + self.unpickler_memo = unpickler.memo + + def dumpf(self, f, object): + """Serialize OBJECT to file-like object F.""" + + pickler = cPickle.Pickler(f, -1) + pickler.memo = self.pickler_memo.copy() + pickler.dump(object) + + def dumps(self, object): + """Return a string containing OBJECT in serialized form.""" + + f = cStringIO.StringIO() + self.dumpf(f, object) + return f.getvalue() + + def loadf(self, f): + """Return the next object deserialized from file-like object F.""" + + unpickler = cPickle.Unpickler(f) + unpickler.memo = self.unpickler_memo.copy() + return unpickler.load() + + def loads(self, s): + """Return the object deserialized from string S.""" + + return self.loadf(cStringIO.StringIO(s)) + + +class CompressingSerializer(Serializer): + """This class wraps other Serializers to compress their serialized data.""" + + def __init__(self, wrapee): + """Constructor. WRAPEE is the Serializer whose bitstream ought to be + compressed.""" + + self.wrapee = wrapee + + def dumpf(self, f, object): + marshal.dump(zlib.compress(self.wrapee.dumps(object), 9), f) + + def dumps(self, object): + return marshal.dumps(zlib.compress(self.wrapee.dumps(object), 9)) + + def loadf(self, f): + return self.wrapee.loads(zlib.decompress(marshal.load(f))) + + def loads(self, s): + return self.wrapee.loads(zlib.decompress(marshal.loads(s))) + + diff --git a/cvs2svn_lib/stats_keeper.py b/cvs2svn_lib/stats_keeper.py new file mode 100644 index 0000000..1a82540 --- /dev/null +++ b/cvs2svn_lib/stats_keeper.py @@ -0,0 +1,189 @@ +# (Be in -*- python -*- mode.) 
+# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains the StatsKeeper class. + +A StatsKeeper can pickle itself to a STATISTICS_FILE. This module +also includes a function to read a StatsKeeper from a STATISTICS_FILE.""" + + +import time +import cPickle +from cStringIO import StringIO + +from cvs2svn_lib.cvs_item import CVSRevision +from cvs2svn_lib.cvs_item import CVSBranch +from cvs2svn_lib.cvs_item import CVSTag + + +class StatsKeeper: + def __init__(self): + self._svn_rev_count = None + self._first_rev_date = 1L<<32 + self._last_rev_date = 0 + self._pass_timings = { } + self._stats_reflect_exclude = False + self.reset_cvs_rev_info() + + def log_duration_for_pass(self, duration, pass_num, pass_name): + self._pass_timings[pass_num] = (pass_name, duration,) + + def set_stats_reflect_exclude(self, value): + self._stats_reflect_exclude = value + + def reset_cvs_rev_info(self): + self._repos_file_count = 0 + self._repos_size = 0 + self._cvs_revs_count = 0 + self._cvs_branches_count = 0 + self._cvs_tags_count = 0 + + # A set of tag_ids seen: + self._tag_ids = set() + + # A set of branch_ids seen: + self._branch_ids = set() + + def record_cvs_file(self, cvs_file): + self._repos_file_count += 1 + self._repos_size += cvs_file.file_size + + def _record_cvs_rev(self, cvs_rev): + 
self._cvs_revs_count += 1 + + if cvs_rev.timestamp < self._first_rev_date: + self._first_rev_date = cvs_rev.timestamp + + if cvs_rev.timestamp > self._last_rev_date: + self._last_rev_date = cvs_rev.timestamp + + def _record_cvs_branch(self, cvs_branch): + self._cvs_branches_count += 1 + self._branch_ids.add(cvs_branch.symbol.id) + + def _record_cvs_tag(self, cvs_tag): + self._cvs_tags_count += 1 + self._tag_ids.add(cvs_tag.symbol.id) + + def record_cvs_item(self, cvs_item): + if isinstance(cvs_item, CVSRevision): + self._record_cvs_rev(cvs_item) + elif isinstance(cvs_item, CVSBranch): + self._record_cvs_branch(cvs_item) + elif isinstance(cvs_item, CVSTag): + self._record_cvs_tag(cvs_item) + else: + raise RuntimeError('Unknown CVSItem type') + + def set_svn_rev_count(self, count): + self._svn_rev_count = count + + def svn_rev_count(self): + return self._svn_rev_count + + def __getstate__(self): + state = self.__dict__.copy() + # This can get kinda large, so we don't store it: + return state + + def archive(self, filename): + f = open(filename, 'wb') + cPickle.dump(self, f) + f.close() + + def __str__(self): + f = StringIO() + f.write('\n') + f.write('cvs2svn Statistics:\n') + f.write('------------------\n') + f.write('Total CVS Files: %10i\n' % (self._repos_file_count,)) + f.write('Total CVS Revisions: %10i\n' % (self._cvs_revs_count,)) + f.write('Total CVS Branches: %10i\n' % (self._cvs_branches_count,)) + f.write('Total CVS Tags: %10i\n' % (self._cvs_tags_count,)) + f.write('Total Unique Tags: %10i\n' % (len(self._tag_ids),)) + f.write('Total Unique Branches: %10i\n' % (len(self._branch_ids),)) + f.write('CVS Repos Size in KB: %10i\n' % ((self._repos_size / 1024),)) + + if self._svn_rev_count is not None: + f.write('Total SVN Commits: %10i\n' % self._svn_rev_count) + + f.write( + 'First Revision Date: %s\n' % (time.ctime(self._first_rev_date),) + ) + f.write( + 'Last Revision Date: %s\n' % (time.ctime(self._last_rev_date),) + ) + f.write('------------------') + + 
if not self._stats_reflect_exclude: + f.write( + '\n' + '(These are unaltered CVS repository stats and do not\n' + ' reflect tags or branches excluded via --exclude)\n' + ) + + return f.getvalue() + + @staticmethod + def _get_timing_format(value): + # Output times with up to 3 decimal places: + decimals = max(0, 4 - len('%d' % int(value))) + length = len(('%%.%df' % decimals) % value) + return '%%%d.%df' % (length, decimals,) + + def single_pass_timing(self, pass_num): + (pass_name, duration,) = self._pass_timings[pass_num] + format = self._get_timing_format(duration) + time_string = format % (duration,) + return ( + 'Time for pass%d (%s): %s seconds.' + % (pass_num, pass_name, time_string,) + ) + + def timings(self): + passes = self._pass_timings.keys() + passes.sort() + f = StringIO() + f.write('Timings (seconds):\n') + f.write('------------------\n') + + total = 0.0 + for pass_num in passes: + (pass_name, duration,) = self._pass_timings[pass_num] + total += duration + + format = self._get_timing_format(total) + + for pass_num in passes: + (pass_name, duration,) = self._pass_timings[pass_num] + f.write( + (format + ' pass%-2d %s\n') % (duration, pass_num, pass_name,) + ) + + f.write((format + ' total') % total) + return f.getvalue() + + +def read_stats_keeper(filename): + """Factory function: Return a _StatsKeeper instance. + + Read the instance from FILENAME as written by StatsKeeper.archive().""" + + f = open(filename, 'rb') + retval = cPickle.load(f) + f.close() + return retval + diff --git a/cvs2svn_lib/stdout_delegate.py b/cvs2svn_lib/stdout_delegate.py new file mode 100644 index 0000000..2b4e228 --- /dev/null +++ b/cvs2svn_lib/stdout_delegate.py @@ -0,0 +1,107 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. 
The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains database facilities used by cvs2svn.""" + + +from cvs2svn_lib.log import Log +from cvs2svn_lib.svn_repository_delegate import SVNRepositoryDelegate + + +class StdoutDelegate(SVNRepositoryDelegate): + """Makes no changes to the disk, but writes out information to + STDOUT about what is happening in the SVN output. Of course, our + print statements will state that we're doing something, when in + reality, we aren't doing anything other than printing out that we're + doing something. Kind of zen, really.""" + + def __init__(self, total_revs): + self.total_revs = total_revs + + def start_commit(self, revnum, revprops): + """Prints out the Subversion revision number of the commit that is + being started.""" + + Log().verbose("=" * 60) + Log().normal("Starting Subversion r%d / %d" % (revnum, self.total_revs)) + + def end_commit(self): + pass + + def initialize_project(self, project): + Log().verbose(" Initializing project %s" % (project,)) + + def initialize_lod(self, lod): + Log().verbose(" Initializing %s" % (lod,)) + + def mkdir(self, lod, cvs_directory): + Log().verbose( + " New Directory %s" % (lod.get_path(cvs_directory.cvs_path),) + ) + + def add_path(self, s_item): + """Print a line stating what path we are 'adding'.""" + + Log().verbose(" Adding %s" % (s_item.cvs_rev.get_svn_path(),)) + + def change_path(self, s_item): + """Print a line stating what path we are 'changing'.""" + + Log().verbose(" Changing %s" % (s_item.cvs_rev.get_svn_path(),)) + + def delete_lod(self, lod): + """Print 
a line stating that we are 'deleting' LOD.""" + + Log().verbose(" Deleting %s" % (lod.get_path(),)) + + def delete_path(self, lod, cvs_path): + """Print a line stating that we are 'deleting' PATH.""" + + Log().verbose(" Deleting %s" % (lod.get_path(cvs_path.cvs_path),)) + + def _show_copy(self, src_path, dest_path, src_revnum): + """Print a line stating that we are 'copying' revision SRC_REVNUM + of SRC_PATH to DEST_PATH.""" + + Log().verbose( + " Copying revision %d of %s\n" + " to %s\n" + % (src_revnum, src_path, dest_path,) + ) + + def copy_lod(self, src_lod, dest_lod, src_revnum): + """Print a line stating that we are 'copying' revision SRC_REVNUM + of SRC_PATH to DEST_PATH.""" + + self._show_copy(src_lod.get_path(), dest_lod.get_path(), src_revnum) + + def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum): + """Print a line stating that we are 'copying' revision SRC_REVNUM + of CVS_PATH from SRC_LOD to DEST_LOD.""" + + self._show_copy( + src_lod.get_path(cvs_path.cvs_path), + dest_lod.get_path(cvs_path.cvs_path), + src_revnum, + ) + + def finish(self): + """State that we are done creating our repository.""" + + Log().verbose("Finished creating Subversion repository.") + Log().quiet("Done.") + + diff --git a/cvs2svn_lib/svn_commit.py b/cvs2svn_lib/svn_commit.py new file mode 100644 index 0000000..25dc38e --- /dev/null +++ b/cvs2svn_lib/svn_commit.py @@ -0,0 +1,381 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. 
For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains the SVNCommit classes. + +There are five types of SVNCommits: + + SVNInitialProjectCommit -- Initializes a project (creates its trunk, + branches, and tags directories). + + SVNPrimaryCommit -- Commits one or more CVSRevisions on one or more + lines of development. + + SVNBranchCommit -- Creates or fills a branch; that is, copies files + from a source line of development to a target branch. + + SVNTagCommit -- Creates or fills a tag; that is, copies files from a + source line of development to a target tag. + + SVNPostCommit -- Updates trunk to reflect changes on a non-trunk + default branch. + +""" + + +import textwrap + +from cvs2svn_lib.common import InternalError +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.symbol import Branch +from cvs2svn_lib.symbol import Tag + + +class SVNCommit: + """This represents one commit to the Subversion Repository.""" + + # textwrap.TextWrapper instance to be used for wrapping log messages: + text_wrapper = textwrap.TextWrapper(width=76) + + def __init__(self, date, revnum): + """Instantiate an SVNCommit. + + REVNUM is the SVN revision number of this commit.""" + + # The date of the commit, as an integer. While the SVNCommit is + # being built up, this contains the latest date seen so far. This + # member is set externally. + self.date = date + + # The SVN revision number of this commit, as an integer. + self.revnum = revnum + + def __getstate__(self): + return (self.date, self.revnum,) + + def __setstate__(self, state): + (self.date, self.revnum,) = state + + def get_cvs_items(self): + """Return a list containing the CVSItems in this commit.""" + + raise NotImplementedError() + + def get_author(self): + """Return the author or this commit, or None if none is to be used. 
+ + The return value is exactly as the author appeared in the RCS + file, with undefined character encoding.""" + + raise NotImplementedError() + + def get_log_msg(self): + """Return a log message for this commit. + + The return value is exactly as the log message appeared in the RCS + file, with undefined character encoding.""" + + raise NotImplementedError() + + def get_warning_summary(self): + """Return a summary of this commit that can be used in warnings.""" + + return '(subversion rev %s)' % (self.revnum,) + + def get_description(self): + """Return a partial description of this SVNCommit, for logging.""" + + raise NotImplementedError() + + def output(self, output_option): + """Cause this commit to be output to OUTPUT_OPTION. + + This method is used for double-dispatch. Derived classes should + call the OutputOption.process_*_commit() method appropriate for + the type of SVNCommit.""" + + raise NotImplementedError() + + def __str__(self): + """ Print a human-readable description of this SVNCommit. 
+ + This description is not intended to be machine-parseable.""" + + ret = "SVNCommit #: " + str(self.revnum) + "\n" + ret += " debug description: " + self.get_description() + "\n" + return ret + + +class SVNInitialProjectCommit(SVNCommit): + def __init__(self, date, projects, revnum): + SVNCommit.__init__(self, date, revnum) + self.projects = list(projects) + + def __getstate__(self): + return ( + SVNCommit.__getstate__(self), + [project.id for project in self.projects], + ) + + def __setstate__(self, state): + (svn_commit_state, project_ids,) = state + SVNCommit.__setstate__(self, svn_commit_state) + self.projects = [ + Ctx()._projects[project_id] for project_id in project_ids + ] + + def get_cvs_items(self): + return [] + + def get_author(self): + return Ctx().username + + def get_log_msg(self): + return self.text_wrapper.fill( + Ctx().initial_project_commit_message % {} + ) + + def get_description(self): + return 'Project initialization' + + def output(self, output_option): + output_option.process_initial_project_commit(self) + + +class SVNRevisionCommit(SVNCommit): + """A SVNCommit that includes actual CVS revisions.""" + + def __init__(self, cvs_revs, date, revnum): + SVNCommit.__init__(self, date, revnum) + + self.cvs_revs = list(cvs_revs) + + # This value is set lazily by _get_metadata(): + self._metadata = None + + def __getstate__(self): + """Return the part of the state represented by this mixin.""" + + return ( + SVNCommit.__getstate__(self), + [cvs_rev.id for cvs_rev in self.cvs_revs], + ) + + def __setstate__(self, state): + """Restore the part of the state represented by this mixin.""" + + (svn_commit_state, cvs_rev_ids) = state + SVNCommit.__setstate__(self, svn_commit_state) + + self.cvs_revs = [ + cvs_rev + for (id, cvs_rev) in Ctx()._cvs_items_db.get_many(cvs_rev_ids) + ] + self._metadata = None + + def get_cvs_items(self): + return self.cvs_revs + + def _get_metadata(self): + """Return the Metadata instance for this commit.""" + + if 
self._metadata is None: + # Set self._metadata for this commit from that of the first cvs + # revision. + if not self.cvs_revs: + raise InternalError('SVNPrimaryCommit contains no CVS revisions') + + metadata_id = self.cvs_revs[0].metadata_id + self._metadata = Ctx()._metadata_db[metadata_id] + + return self._metadata + + def get_author(self): + return self._get_metadata().author + + def get_warning_summary(self): + retval = [] + retval.append(SVNCommit.get_warning_summary(self) + ' Related files:') + for cvs_rev in self.cvs_revs: + retval.append(' ' + cvs_rev.cvs_file.filename) + return '\n'.join(retval) + + def __str__(self): + """Return the revision part of a description of this SVNCommit. + + Derived classes should append the output of this method to the + output of SVNCommit.__str__().""" + + ret = [] + ret.append(SVNCommit.__str__(self)) + ret.append(' cvs_revs:\n') + for cvs_rev in self.cvs_revs: + ret.append(' %x\n' % (cvs_rev.id,)) + return ''.join(ret) + + +class SVNPrimaryCommit(SVNRevisionCommit): + def __init__(self, cvs_revs, date, revnum): + SVNRevisionCommit.__init__(self, cvs_revs, date, revnum) + + def get_log_msg(self): + """Return the actual log message for this commit.""" + + return self._get_metadata().log_msg + + def get_description(self): + return 'commit' + + def output(self, output_option): + output_option.process_primary_commit(self) + + +class SVNPostCommit(SVNRevisionCommit): + def __init__(self, motivating_revnum, cvs_revs, date, revnum): + SVNRevisionCommit.__init__(self, cvs_revs, date, revnum) + + # The subversion revision number of the *primary* commit where the + # default branch changes actually happened. (NOTE: Secondary + # commits that fill branches and tags also have a motivating + # commit, but we do not record it because it is (currently) not + # needed for anything.) motivating_revnum is used when generating + # the log message for the commit that synchronizes the default + # branch with trunk. 
+ # + # It is possible for multiple synchronization commits to refer to + # the same motivating commit revision number, and it is possible + # for a single synchronization commit to contain CVSRevisions on + # multiple different default branches. + self.motivating_revnum = motivating_revnum + + def __getstate__(self): + return ( + SVNRevisionCommit.__getstate__(self), + self.motivating_revnum, + ) + + def __setstate__(self, state): + (rev_state, self.motivating_revnum,) = state + SVNRevisionCommit.__setstate__(self, rev_state) + + def get_cvs_items(self): + # It might seem that we should return + # SVNRevisionCommit.get_cvs_items(self) here, but this commit + # doesn't really include those CVSItems, but rather followup + # commits to those. + return [] + + def get_log_msg(self): + """Return a manufactured log message for this commit.""" + + return self.text_wrapper.fill( + Ctx().post_commit_message % {'revnum' : self.motivating_revnum} + ) + + def get_description(self): + return 'post-commit default branch(es)' + + def output(self, output_option): + output_option.process_post_commit(self) + + +class SVNSymbolCommit(SVNCommit): + def __init__(self, symbol, cvs_symbol_ids, date, revnum): + SVNCommit.__init__(self, date, revnum) + + # The TypedSymbol that is filled in this SVNCommit. 
+ self.symbol = symbol + + self.cvs_symbol_ids = cvs_symbol_ids + + def __getstate__(self): + return ( + SVNCommit.__getstate__(self), + self.symbol.id, self.cvs_symbol_ids, + ) + + def __setstate__(self, state): + (svn_commit_state, symbol_id, self.cvs_symbol_ids) = state + SVNCommit.__setstate__(self, svn_commit_state) + self.symbol = Ctx()._symbol_db.get_symbol(symbol_id) + + def get_cvs_items(self): + return [ + cvs_symbol + for (id, cvs_symbol) + in Ctx()._cvs_items_db.get_many(self.cvs_symbol_ids) + ] + + def _get_symbol_type(self): + """Return the type of the self.symbol ('branch' or 'tag').""" + + raise NotImplementedError() + + def get_author(self): + return Ctx().username + + def get_log_msg(self): + """Return a manufactured log message for this commit.""" + + return self.text_wrapper.fill( + Ctx().symbol_commit_message % { + 'symbol_type' : self._get_symbol_type(), + 'symbol_name' : self.symbol.name, + } + ) + + def get_description(self): + return 'copying to %s %r' % (self._get_symbol_type(), self.symbol.name,) + + def __str__(self): + """ Print a human-readable description of this SVNCommit. 
+ + This description is not intended to be machine-parseable.""" + + return ( + SVNCommit.__str__(self) + + " symbolic name: %s\n" % (self.symbol.name,) + ) + + +class SVNBranchCommit(SVNSymbolCommit): + def __init__(self, symbol, cvs_symbol_ids, date, revnum): + if not isinstance(symbol, Branch): + raise InternalError('Incorrect symbol type %r' % (symbol,)) + + SVNSymbolCommit.__init__(self, symbol, cvs_symbol_ids, date, revnum) + + def _get_symbol_type(self): + return 'branch' + + def output(self, output_option): + output_option.process_branch_commit(self) + + +class SVNTagCommit(SVNSymbolCommit): + def __init__(self, symbol, cvs_symbol_ids, date, revnum): + if not isinstance(symbol, Tag): + raise InternalError('Incorrect symbol type %r' % (symbol,)) + + SVNSymbolCommit.__init__(self, symbol, cvs_symbol_ids, date, revnum) + + def _get_symbol_type(self): + return 'tag' + + def output(self, output_option): + output_option.process_tag_commit(self) + + diff --git a/cvs2svn_lib/svn_commit_creator.py b/cvs2svn_lib/svn_commit_creator.py new file mode 100644 index 0000000..c87db38 --- /dev/null +++ b/cvs2svn_lib/svn_commit_creator.py @@ -0,0 +1,217 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""This module contains the SVNCommitCreator class.""" + + +import time + +from cvs2svn_lib.common import InternalError +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.cvs_item import CVSRevisionNoop +from cvs2svn_lib.cvs_item import CVSBranchNoop +from cvs2svn_lib.cvs_item import CVSTagNoop +from cvs2svn_lib.changeset import OrderedChangeset +from cvs2svn_lib.changeset import BranchChangeset +from cvs2svn_lib.changeset import TagChangeset +from cvs2svn_lib.svn_commit import SVNInitialProjectCommit +from cvs2svn_lib.svn_commit import SVNPrimaryCommit +from cvs2svn_lib.svn_commit import SVNPostCommit +from cvs2svn_lib.svn_commit import SVNBranchCommit +from cvs2svn_lib.svn_commit import SVNTagCommit +from cvs2svn_lib.key_generator import KeyGenerator + + +class SVNCommitCreator: + """This class creates and yields SVNCommits via process_changeset().""" + + def __init__(self): + # The revision number to assign to the next new SVNCommit. + self.revnum_generator = KeyGenerator() + + # A set containing the Projects that have already been + # initialized: + self._initialized_projects = set() + + def _post_commit(self, cvs_revs, motivating_revnum, timestamp): + """Generate any SVNCommits needed to follow CVS_REVS. + + That is, handle non-trunk default branches. A revision on a CVS + non-trunk default branch is visible in a default CVS checkout of + HEAD. So we copy such commits over to Subversion's trunk so that + checking out SVN trunk gives the same output as checking out of + CVS's default branch.""" + + cvs_revs = [ + cvs_rev + for cvs_rev in cvs_revs + if cvs_rev.ntdbr and not isinstance(cvs_rev, CVSRevisionNoop) + ] + + if cvs_revs: + cvs_revs.sort( + lambda a, b: cmp(a.cvs_file.filename, b.cvs_file.filename) + ) + # Generate an SVNCommit for all of our default branch cvs_revs. 
+ yield SVNPostCommit( + motivating_revnum, cvs_revs, timestamp, + self.revnum_generator.gen_id(), + ) + + def _process_revision_changeset(self, changeset, timestamp): + """Process CHANGESET, using TIMESTAMP as the commit time. + + Create and yield one or more SVNCommits in the process. CHANGESET + must be an OrderedChangeset. TIMESTAMP is used as the timestamp + for any resulting SVNCommits.""" + + if not changeset.cvs_item_ids: + Log().warn('Changeset has no items: %r' % changeset) + return + + Log().verbose('-' * 60) + Log().verbose('CVS Revision grouping:') + Log().verbose(' Time: %s' % time.ctime(timestamp)) + + # Generate an SVNCommit unconditionally. Even if the only change in + # this group of CVSRevisions is a deletion of an already-deleted + # file (that is, a CVS revision in state 'dead' whose predecessor + # was also in state 'dead'), the conversion will still generate a + # Subversion revision containing the log message for the second dead + # revision, because we don't want to lose that information. + + cvs_revs = list(changeset.iter_cvs_items()) + if cvs_revs: + cvs_revs.sort(lambda a, b: cmp(a.cvs_file.filename, b.cvs_file.filename)) + svn_commit = SVNPrimaryCommit( + cvs_revs, timestamp, self.revnum_generator.gen_id() + ) + + yield svn_commit + + for cvs_rev in cvs_revs: + Ctx()._symbolings_logger.log_revision(cvs_rev, svn_commit.revnum) + + # Generate an SVNPostCommit if we have default branch revs. If + # some of the revisions in this commit happened on a non-trunk + # default branch, then those files have to be copied into trunk + # manually after being changed on the branch (because the RCS + # "default branch" appears as head, i.e., trunk, in practice). + # Unfortunately, Subversion doesn't support copies with sources + # in the current txn. All copies must be based in committed + # revisions. Therefore, we generate the copies in a new + # revision. 
+ for svn_post_commit in self._post_commit( + cvs_revs, svn_commit.revnum, timestamp + ): + yield svn_post_commit + + def _process_tag_changeset(self, changeset, timestamp): + """Process TagChangeset CHANGESET, producing a SVNTagCommit. + + Filter out CVSTagNoops. If no CVSTags are left, don't generate a + SVNTagCommit.""" + + if Ctx().trunk_only: + raise InternalError( + 'TagChangeset encountered during a --trunk-only conversion') + + cvs_tag_ids = [ + cvs_tag.id + for cvs_tag in changeset.iter_cvs_items() + if not isinstance(cvs_tag, CVSTagNoop) + ] + if cvs_tag_ids: + yield SVNTagCommit( + changeset.symbol, cvs_tag_ids, timestamp, + self.revnum_generator.gen_id(), + ) + else: + Log().debug( + 'Omitting %r because it contains only CVSTagNoops' % (changeset,) + ) + + def _process_branch_changeset(self, changeset, timestamp): + """Process BranchChangeset CHANGESET, producing a SVNBranchCommit. + + Filter out CVSBranchNoops. If no CVSBranches are left, don't + generate a SVNBranchCommit.""" + + if Ctx().trunk_only: + raise InternalError( + 'BranchChangeset encountered during a --trunk-only conversion') + + cvs_branches = [ + cvs_branch + for cvs_branch in changeset.iter_cvs_items() + if not isinstance(cvs_branch, CVSBranchNoop) + ] + if cvs_branches: + svn_commit = SVNBranchCommit( + changeset.symbol, + [cvs_branch.id for cvs_branch in cvs_branches], + timestamp, + self.revnum_generator.gen_id(), + ) + yield svn_commit + for cvs_branch in cvs_branches: + Ctx()._symbolings_logger.log_branch_revision( + cvs_branch, svn_commit.revnum + ) + else: + Log().debug( + 'Omitting %r because it contains only CVSBranchNoops' % (changeset,) + ) + + def process_changeset(self, changeset, timestamp): + """Process CHANGESET, using TIMESTAMP for all of its entries. + + Return a generator that generates the resulting SVNCommits. 
+ + The changesets must be fed to this function in proper dependency + order.""" + + # First create any new projects that might be opened by the + # changeset: + projects_opened = \ + changeset.get_projects_opened() - self._initialized_projects + if projects_opened: + if Ctx().cross_project_commits: + yield SVNInitialProjectCommit( + timestamp, projects_opened, self.revnum_generator.gen_id() + ) + else: + for project in projects_opened: + yield SVNInitialProjectCommit( + timestamp, [project], self.revnum_generator.gen_id() + ) + self._initialized_projects.update(projects_opened) + + if isinstance(changeset, OrderedChangeset): + for svn_commit \ + in self._process_revision_changeset(changeset, timestamp): + yield svn_commit + elif isinstance(changeset, TagChangeset): + for svn_commit in self._process_tag_changeset(changeset, timestamp): + yield svn_commit + elif isinstance(changeset, BranchChangeset): + for svn_commit in self._process_branch_changeset(changeset, timestamp): + yield svn_commit + else: + raise TypeError('Illegal changeset %r' % changeset) + + diff --git a/cvs2svn_lib/svn_commit_item.py b/cvs2svn_lib/svn_commit_item.py new file mode 100644 index 0000000..8bc9015 --- /dev/null +++ b/cvs2svn_lib/svn_commit_item.py @@ -0,0 +1,50 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
"""This module contains class SVNCommitItem."""


class SVNCommitItem:
  """A wrapper class for CVSRevision objects upon which
  Subversion-related data (such as properties) may be hung."""

  def __init__(self, cvs_rev, svn_props_changed):
    """Initialize instance and record the properties for this file.

    SVN_PROPS_CHANGED indicates whether the svn: properties are known
    to have changed since the last revision.

    The properties are set by the SVNPropertySetters in
    Ctx().svn_property_setters."""

    # Imported at the use site (rather than at module level) so that
    # this lightweight wrapper module can be imported without pulling
    # in the full conversion context machinery.
    from cvs2svn_lib.context import Ctx

    self.cvs_rev = cvs_rev
    # Did the svn properties change for this file (i.e., do they have
    # to be written to the dumpfile?)
    self.svn_props_changed = svn_props_changed

    # The properties for this item as a map { key : value }.  If VALUE
    # is None, the property should be left unset.
    self.svn_props = { }

    for svn_property_setter in Ctx().svn_property_setters:
      svn_property_setter.set_properties(self)

  def has_keywords(self):
    """Return True iff this item has a non-empty svn:keywords property."""

    return bool(self.svn_props.get('svn:keywords', None))
"""Classes for outputting the converted repository to SVN."""


import os

from cvs2svn_lib import config
from cvs2svn_lib.common import InternalError
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.common import FatalException
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.common import format_date
from cvs2svn_lib.common import PathsNotDisjointException
from cvs2svn_lib.common import verify_paths_disjoint
from cvs2svn_lib.log import Log
from cvs2svn_lib.context import Ctx
from cvs2svn_lib.artifact_manager import artifact_manager
from cvs2svn_lib.process import CommandFailedException
from cvs2svn_lib.process import check_command_runs
from cvs2svn_lib.process import call_command
from cvs2svn_lib.cvs_file import CVSDirectory
from cvs2svn_lib.symbol import Trunk
from cvs2svn_lib.symbol import LineOfDevelopment
from cvs2svn_lib.cvs_item import CVSRevisionAdd
from cvs2svn_lib.cvs_item import CVSRevisionChange
from cvs2svn_lib.cvs_item import CVSRevisionDelete
from cvs2svn_lib.cvs_item import CVSRevisionNoop
from cvs2svn_lib.repository_mirror import RepositoryMirror
from cvs2svn_lib.repository_mirror import PathExistsError
from cvs2svn_lib.svn_commit_item import SVNCommitItem
from cvs2svn_lib.openings_closings import SymbolingsReader
from cvs2svn_lib.fill_source import get_source_set
from cvs2svn_lib.stdout_delegate import StdoutDelegate
from cvs2svn_lib.dumpfile_delegate import DumpfileDelegate
from cvs2svn_lib.repository_delegate import RepositoryDelegate
from cvs2svn_lib.output_option import OutputOption


class SVNOutputOption(OutputOption):
  """An OutputOption appropriate for output to Subversion."""

  class ParentMissingError(Exception):
    """The parent of a path is missing.

    Exception raised if an attempt is made to add a path to the
    repository mirror but the parent's path doesn't exist in the
    youngest revision of the repository."""

    pass

  class ExpectedDirectoryError(Exception):
    """A file was found where a directory was expected."""

    pass

  def __init__(self, author_transforms=None):
    self._mirror = RepositoryMirror()

    def to_utf8(s):
      # NOTE(review): 'unicode' is Python-2-only; this module predates
      # Python 3 and assumes a Python 2 interpreter.
      if isinstance(s, unicode):
        return s.encode('utf8')
      else:
        return s

    # Map of CVS author name -> SVN author name, with both sides
    # normalized to utf8 byte strings.
    self.author_transforms = {}
    if author_transforms is not None:
      for (cvsauthor, name) in author_transforms.iteritems():
        cvsauthor = to_utf8(cvsauthor)
        name = to_utf8(name)
        self.author_transforms[cvsauthor] = name

  def register_artifacts(self, which_pass):
    # These artifacts are needed for SymbolingsReader:
    artifact_manager.register_temp_file_needed(
        config.SYMBOL_OPENINGS_CLOSINGS_SORTED, which_pass
        )
    artifact_manager.register_temp_file_needed(
        config.SYMBOL_OFFSETS_DB, which_pass
        )

    self._mirror.register_artifacts(which_pass)
    Ctx().revision_reader.register_artifacts(which_pass)

  def check_symbols(self, symbol_map):
    """Check that the paths of all included LODs are set and disjoint."""

    error_found = False

    # Check that all included LODs have their base paths set, and
    # collect the paths into a list:
    paths = []
    for lod in symbol_map.itervalues():
      if isinstance(lod, LineOfDevelopment):
        if lod.base_path is None:
          Log().error('%s: No path was set for %r\n' % (error_prefix, lod,))
          error_found = True
        else:
          paths.append(lod.base_path)

    # Check that the SVN paths of all LODs are disjoint:
    try:
      verify_paths_disjoint(*paths)
    except PathsNotDisjointException as e:
      Log().error(str(e))
      error_found = True

    if error_found:
      raise FatalException(
          'Please fix the above errors and restart CollateSymbolsPass'
          )

  def setup(self, svn_rev_count):
    self._symbolings_reader = SymbolingsReader()
    self._mirror.open()
    self._delegates = []
    Ctx().revision_reader.start()
    self.add_delegate(StdoutDelegate(svn_rev_count))

  def _get_author(self, svn_commit):
    # Apply the author_transforms mapping, falling back to the raw
    # CVS author name when no transform is registered.
    author = svn_commit.get_author()
    name = self.author_transforms.get(author, author)
    return name

  def _get_revprops(self, svn_commit):
    """Return the Subversion revprops for this SVNCommit."""

    return {
        'svn:author' : self._get_author(svn_commit),
        'svn:log' : svn_commit.get_log_msg(),
        'svn:date' : format_date(svn_commit.date),
        }

  def start_commit(self, revnum, revprops):
    """Start a new commit."""

    self._mirror.start_commit(revnum)
    self._invoke_delegates('start_commit', revnum, revprops)

  def end_commit(self):
    """Called at the end of each commit.

    This method copies the newly created nodes to the on-disk nodes
    db."""

    self._mirror.end_commit()
    self._invoke_delegates('end_commit')

  def delete_lod(self, lod):
    """Delete the main path for LOD from the tree.

    The path must currently exist.  Silently refuse to delete trunk
    paths."""

    if isinstance(lod, Trunk):
      # Never delete a Trunk path.
      return

    self._mirror.get_current_lod_directory(lod).delete()
    self._invoke_delegates('delete_lod', lod)

  def delete_path(self, cvs_path, lod, should_prune=False):
    """Delete CVS_PATH from LOD."""

    if cvs_path.parent_directory is None:
      self.delete_lod(lod)
      return

    parent_node = self._mirror.get_current_path(
        cvs_path.parent_directory, lod
        )
    del parent_node[cvs_path]
    self._invoke_delegates('delete_path', lod, cvs_path)

    if should_prune:
      # Walk up the tree, deleting directories that have become empty.
      while parent_node is not None and len(parent_node) == 0:
        # A drawback of this code is that we issue a delete for each
        # path and not just a single delete for the topmost directory
        # pruned.
        node = parent_node
        cvs_path = node.cvs_path
        if cvs_path.parent_directory is None:
          parent_node = None
          self.delete_lod(lod)
        else:
          parent_node = node.parent_mirror_dir
          node.delete()
          self._invoke_delegates('delete_path', lod, cvs_path)

  def initialize_project(self, project):
    """Create the basic structure for PROJECT."""

    self._invoke_delegates('initialize_project', project)

    # Don't invoke delegates.
    self._mirror.add_lod(project.get_trunk())

  def change_path(self, cvs_rev):
    """Register a change in self._youngest for the CVS_REV's svn_path."""

    # We do not have to update the nodes because our mirror is only
    # concerned with the presence or absence of paths, and a file
    # content change does not cause any path changes.
    self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))

  def _mkdir_p(self, cvs_directory, lod):
    """Make sure that CVS_DIRECTORY exists in LOD.

    If not, create it, calling delegates.  Return the node for
    CVS_DIRECTORY."""

    try:
      node = self._mirror.get_current_lod_directory(lod)
    except KeyError:
      node = self._mirror.add_lod(lod)
      self._invoke_delegates('initialize_lod', lod)

    for sub_path in cvs_directory.get_ancestry()[1:]:
      try:
        node = node[sub_path]
      except KeyError:
        node = node.mkdir(sub_path)
        self._invoke_delegates('mkdir', lod, sub_path)
      if node is None:
        raise self.ExpectedDirectoryError(
            'File found at \'%s\' where directory was expected.' % (sub_path,)
            )

    return node

  def add_path(self, cvs_rev):
    """Add the CVS_REV's svn_path to the repository mirror.

    Create any missing intermediate paths."""

    cvs_file = cvs_rev.cvs_file
    parent_path = cvs_file.parent_directory
    lod = cvs_rev.lod
    parent_node = self._mkdir_p(parent_path, lod)
    parent_node.add_file(cvs_file)
    self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))

  def copy_lod(self, src_lod, dest_lod, src_revnum):
    """Copy all of SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination LOD
    *must not* already exist.

    Return the new node at DEST_LOD.  Note that this node is not
    necessarily writable, though its parent node necessarily is."""

    node = self._mirror.copy_lod(src_lod, dest_lod, src_revnum)
    self._invoke_delegates('copy_lod', src_lod, dest_lod, src_revnum)
    return node

  def copy_path(
        self, cvs_path, src_lod, dest_lod, src_revnum, create_parent=False
        ):
    """Copy CVS_PATH from SRC_LOD at SRC_REVNUM to DST_LOD.

    In the youngest revision of the repository, the destination's
    parent *must* exist unless CREATE_PARENT is specified.  But the
    destination itself *must not* exist.

    Return the new node at (CVS_PATH, DEST_LOD), as a
    CurrentMirrorDirectory."""

    if cvs_path.parent_directory is None:
      return self.copy_lod(src_lod, dest_lod, src_revnum)

    # Get the node of our source, or None if it is a file:
    src_node = self._mirror.get_old_path(cvs_path, src_lod, src_revnum)

    # Get the parent path of the destination:
    if create_parent:
      dest_parent_node = self._mkdir_p(cvs_path.parent_directory, dest_lod)
    else:
      try:
        dest_parent_node = self._mirror.get_current_path(
            cvs_path.parent_directory, dest_lod
            )
      except KeyError:
        raise self.ParentMissingError(
            'Attempt to add path \'%s\' to repository mirror, '
            'but its parent directory doesn\'t exist in the mirror.'
            % (dest_lod.get_path(cvs_path.cvs_path),)
            )

    if cvs_path in dest_parent_node:
      raise PathExistsError(
          'Attempt to add path \'%s\' to repository mirror '
          'when it already exists in the mirror.'
          % (dest_lod.get_path(cvs_path.cvs_path),)
          )

    dest_parent_node[cvs_path] = src_node
    self._invoke_delegates(
        'copy_path',
        cvs_path, src_lod, dest_lod, src_revnum
        )

    return dest_parent_node[cvs_path]

  def fill_symbol(self, svn_symbol_commit, fill_source):
    """Perform all copies for the CVSSymbols in SVN_SYMBOL_COMMIT.

    The symbolic name is guaranteed to exist in the Subversion
    repository by the end of this call, even if there are no paths
    under it."""

    symbol = svn_symbol_commit.symbol

    try:
      dest_node = self._mirror.get_current_lod_directory(symbol)
    except KeyError:
      self._fill_directory(symbol, None, fill_source, None)
    else:
      self._fill_directory(symbol, dest_node, fill_source, None)

  def _fill_directory(self, symbol, dest_node, fill_source, parent_source):
    """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE.

    Use items from FILL_SOURCE, and recurse into the child items.

    Fill SYMBOL starting at the path FILL_SOURCE.cvs_path.  DEST_NODE
    is the node of this destination path, or None if the destination
    does not yet exist.  All directories above this path have already
    been filled.  FILL_SOURCE is a FillSource instance describing the
    items within a subtree of the repository that still need to be
    copied to the destination.

    PARENT_SOURCE is the SVNRevisionRange that was used to copy the
    parent directory, if it was copied in this commit.  We prefer to
    copy from the same source as was used for the parent, since it
    typically requires less touching-up.  If PARENT_SOURCE is None,
    then the parent directory was not copied in this commit, so no
    revision is preferable to any other."""

    copy_source = fill_source.compute_best_source(parent_source)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    if dest_node is None:
      # The destination does not exist at all, so it definitely has to
      # be copied:
      dest_node = self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )
    elif (parent_source is not None) and (
          copy_source.source_lod != parent_source.source_lod
          or copy_source.opening_revnum != parent_source.opening_revnum
          ):
      # The parent path was copied from a different source than we
      # need to use, so we have to delete the version that was copied
      # with the parent then re-copy from the correct source:
      self.delete_path(fill_source.cvs_path, symbol)
      dest_node = self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )
    else:
      copy_source = parent_source

    # The map {CVSPath : FillSource} of entries within this directory
    # that need filling:
    src_entries = fill_source.get_subsource_map()

    if copy_source is not None:
      self._prune_extra_entries(
          fill_source.cvs_path, symbol, dest_node, src_entries
          )

    return self._cleanup_filled_directory(
        symbol, dest_node, src_entries, copy_source
        )

  def _cleanup_filled_directory(
        self, symbol, dest_node, src_entries, copy_source
        ):
    """The directory at DEST_NODE has been filled and pruned; recurse.

    Recurse into the SRC_ENTRIES, in alphabetical order.  If DEST_NODE
    was copied in this revision, COPY_SOURCE should indicate where it
    was copied from; otherwise, COPY_SOURCE should be None."""

    # sorted() works in both Python 2 and 3 (the old keys()+sort()
    # idiom relied on Python 2 returning a list).
    for cvs_path in sorted(src_entries):
      if isinstance(cvs_path, CVSDirectory):
        # Path is a CVSDirectory:
        try:
          dest_subnode = dest_node[cvs_path]
        except KeyError:
          # Path doesn't exist yet; it has to be created:
          dest_node = self._fill_directory(
              symbol, None, src_entries[cvs_path], None
              ).parent_mirror_dir
        else:
          # Path already exists, but might have to be cleaned up:
          dest_node = self._fill_directory(
              symbol, dest_subnode, src_entries[cvs_path], copy_source
              ).parent_mirror_dir
      else:
        # Path is a CVSFile:
        self._fill_file(
            symbol, cvs_path in dest_node, src_entries[cvs_path], copy_source
            )
        # Reread dest_node since the call to _fill_file() might have
        # made it writable:
        dest_node = self._mirror.get_current_path(
            dest_node.cvs_path, dest_node.lod
            )

    return dest_node

  def _fill_file(self, symbol, dest_existed, fill_source, parent_source):
    """Fill the tag or branch SYMBOL at the path indicated by FILL_SOURCE.

    Use items from FILL_SOURCE.

    Fill SYMBOL at path FILL_SOURCE.cvs_path.  DEST_EXISTED indicates
    whether the destination path already exists.  All directories
    above this path have already been filled as needed.  FILL_SOURCE
    is a FillSource instance describing the item that needs to be
    copied to the destination.

    PARENT_SOURCE is the source from which the parent directory was
    copied, or None if the parent directory was not copied during this
    commit.  We prefer to copy from PARENT_SOURCE, since it typically
    requires less touching-up.  If PARENT_SOURCE is None, then the
    parent directory was not copied in this commit, so no revision is
    preferable to any other."""

    copy_source = fill_source.compute_best_source(parent_source)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    if not dest_existed:
      # The destination does not exist at all, so it definitely has to
      # be copied:
      self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )
    elif (parent_source is not None) and (
          copy_source.source_lod != parent_source.source_lod
          or copy_source.opening_revnum != parent_source.opening_revnum
          ):
      # The parent path was copied from a different source than we
      # need to use, so we have to delete the version that was copied
      # with the parent and then re-copy from the correct source:
      self.delete_path(fill_source.cvs_path, symbol)
      self.copy_path(
          fill_source.cvs_path, copy_source.source_lod,
          symbol, copy_source.opening_revnum
          )

  def _prune_extra_entries(
        self, dest_cvs_path, symbol, dest_node, src_entries
        ):
    """Delete any entries in DEST_NODE that are not in SRC_ENTRIES."""

    delete_list = [
        cvs_path
        for cvs_path in dest_node
        if cvs_path not in src_entries
        ]

    # Sort the delete list so that the output is in a consistent
    # order:
    delete_list.sort()
    for cvs_path in delete_list:
      del dest_node[cvs_path]
      self._invoke_delegates('delete_path', symbol, cvs_path)

  def add_delegate(self, delegate):
    """Adds DELEGATE to self._delegates.

    For every delegate you add, whenever a repository action method is
    performed, delegate's corresponding repository action method is
    called.  Multiple delegates will be called in the order that they
    are added.  See SVNRepositoryDelegate for more information."""

    self._delegates.append(delegate)

  def _invoke_delegates(self, method, *args):
    """Invoke a method on each delegate.

    Iterate through each of our delegates, in the order that they were
    added, and call the delegate's method named METHOD with the
    arguments in ARGS."""

    for delegate in self._delegates:
      getattr(delegate, method)(*args)

  def process_initial_project_commit(self, svn_commit):
    self.start_commit(svn_commit.revnum, self._get_revprops(svn_commit))

    for project in svn_commit.projects:
      self.initialize_project(project)

    self.end_commit()

  def process_primary_commit(self, svn_commit):
    self.start_commit(svn_commit.revnum, self._get_revprops(svn_commit))

    # This actually commits CVSRevisions
    if len(svn_commit.cvs_revs) > 1:
      plural = "s"
    else:
      plural = ""
    Log().verbose("Committing %d CVSRevision%s"
                  % (len(svn_commit.cvs_revs), plural))
    for cvs_rev in svn_commit.cvs_revs:
      if isinstance(cvs_rev, CVSRevisionNoop):
        pass

      elif isinstance(cvs_rev, CVSRevisionDelete):
        self.delete_path(cvs_rev.cvs_file, cvs_rev.lod, Ctx().prune)

      elif isinstance(cvs_rev, CVSRevisionAdd):
        self.add_path(cvs_rev)

      elif isinstance(cvs_rev, CVSRevisionChange):
        self.change_path(cvs_rev)

      else:
        # Fail loudly rather than silently skipping an unknown type
        # (consistent with process_post_commit):
        raise InternalError('Unexpected CVSRevision type: %s' % (cvs_rev,))

    self.end_commit()

  def process_post_commit(self, svn_commit):
    self.start_commit(svn_commit.revnum, self._get_revprops(svn_commit))

    Log().verbose(
        'Synchronizing default branch motivated by %d'
        % (svn_commit.motivating_revnum,)
        )

    for cvs_rev in svn_commit.cvs_revs:
      trunk = cvs_rev.cvs_file.project.get_trunk()
      if isinstance(cvs_rev, CVSRevisionAdd):
        # Copy from branch to trunk:
        self.copy_path(
            cvs_rev.cvs_file, cvs_rev.lod, trunk,
            svn_commit.motivating_revnum, True
            )
      elif isinstance(cvs_rev, CVSRevisionChange):
        # Delete old version of the path on trunk...
        self.delete_path(cvs_rev.cvs_file, trunk)
        # ...and copy the new version over from branch:
        self.copy_path(
            cvs_rev.cvs_file, cvs_rev.lod, trunk,
            svn_commit.motivating_revnum, True
            )
      elif isinstance(cvs_rev, CVSRevisionDelete):
        # Delete trunk path:
        self.delete_path(cvs_rev.cvs_file, trunk)
      elif isinstance(cvs_rev, CVSRevisionNoop):
        # Do nothing
        pass
      else:
        raise InternalError('Unexpected CVSRevision type: %s' % (cvs_rev,))

    self.end_commit()

  def process_branch_commit(self, svn_commit):
    self.start_commit(svn_commit.revnum, self._get_revprops(svn_commit))
    Log().verbose('Filling branch:', svn_commit.symbol.name)

    # Get the set of sources for the symbolic name:
    source_set = get_source_set(
        svn_commit.symbol,
        self._symbolings_reader.get_range_map(svn_commit),
        )

    self.fill_symbol(svn_commit, source_set)

    self.end_commit()

  def process_tag_commit(self, svn_commit):
    self.start_commit(svn_commit.revnum, self._get_revprops(svn_commit))
    Log().verbose('Filling tag:', svn_commit.symbol.name)

    # Get the set of sources for the symbolic name:
    source_set = get_source_set(
        svn_commit.symbol,
        self._symbolings_reader.get_range_map(svn_commit),
        )

    self.fill_symbol(svn_commit, source_set)

    self.end_commit()

  def cleanup(self):
    self._invoke_delegates('finish')
    self._mirror.close()
    self._mirror = None
    Ctx().revision_reader.finish()
    self._symbolings_reader.close()
    del self._symbolings_reader


class DumpfileOutputOption(SVNOutputOption):
  """Output the result of the conversion into a dumpfile."""

  def __init__(self, dumpfile_path, author_transforms=None):
    SVNOutputOption.__init__(self, author_transforms)
    self.dumpfile_path = dumpfile_path

  def check(self):
    pass

  def setup(self, svn_rev_count):
    Log().quiet("Starting Subversion Dumpfile.")
    SVNOutputOption.setup(self, svn_rev_count)
    if not Ctx().dry_run:
      self.add_delegate(
          DumpfileDelegate(Ctx().revision_reader, self.dumpfile_path)
          )


class RepositoryOutputOption(SVNOutputOption):
  """Output the result of the conversion into an SVN repository."""

  def __init__(self, target, author_transforms=None):
    SVNOutputOption.__init__(self, author_transforms)
    self.target = target

  def check(self):
    if not Ctx().dry_run:
      # Verify that svnadmin can be executed.  The 'help' subcommand
      # should be harmless.
      try:
        check_command_runs([Ctx().svnadmin_executable, 'help'], 'svnadmin')
      except CommandFailedException as e:
        raise FatalError(
            '%s\n'
            'svnadmin could not be executed.  Please ensure that it is\n'
            'installed and/or use the --svnadmin option.' % (e,))

  def setup(self, svn_rev_count):
    Log().quiet("Starting Subversion Repository.")
    SVNOutputOption.setup(self, svn_rev_count)
    if not Ctx().dry_run:
      self.add_delegate(
          RepositoryDelegate(Ctx().revision_reader, self.target)
          )


class NewRepositoryOutputOption(RepositoryOutputOption):
  """Output the result of the conversion into a new SVN repository."""

  def __init__(
        self, target, fs_type=None, bdb_txn_nosync=None,
        author_transforms=None, create_options=None
        ):
    # CREATE_OPTIONS defaults to None (not a mutable []) to avoid the
    # shared-mutable-default-argument pitfall; None means "no extra
    # options".
    RepositoryOutputOption.__init__(self, target, author_transforms)
    self.bdb_txn_nosync = bdb_txn_nosync

    # Determine the options to be passed to "svnadmin create":
    if not fs_type:
      # User didn't say what kind of repository (bdb, fsfs, etc).  We
      # still pass --bdb-txn-nosync.  It's a no-op if the default
      # repository type doesn't support it, but we definitely want it
      # if BDB is the default.
      self.create_options = ['--bdb-txn-nosync']
    elif fs_type == 'bdb':
      # User explicitly specified bdb.
      #
      # Since this is a BDB repository, pass --bdb-txn-nosync, because
      # it gives us a 4-5x speed boost (if cvs2svn is creating the
      # repository, cvs2svn should be the only program accessing the
      # svn repository until cvs2svn is done).  But we'll turn no-sync
      # off in self.finish(), unless instructed otherwise.
      self.create_options = ['--fs-type=bdb', '--bdb-txn-nosync']
    else:
      # User specified something other than bdb.
      self.create_options = ['--fs-type=%s' % fs_type]

    # Now append the user's explicitly-set create options:
    self.create_options += (create_options or [])

  def check(self):
    RepositoryOutputOption.check(self)
    if not Ctx().dry_run and os.path.exists(self.target):
      raise FatalError("the svn-repos-path '%s' exists.\n"
                       "Remove it, or pass '--existing-svnrepos'."
                       % self.target)

  def setup(self, svn_rev_count):
    Log().normal("Creating new repository '%s'" % (self.target))
    if Ctx().dry_run:
      # Do not actually create repository:
      pass
    else:
      call_command([
          Ctx().svnadmin_executable, 'create',
          ] + self.create_options + [
          self.target
          ])

    RepositoryOutputOption.setup(self, svn_rev_count)

  def cleanup(self):
    RepositoryOutputOption.cleanup(self)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking self.fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if Ctx().dry_run:
      # Do not change repository:
      pass
    elif not self.bdb_txn_nosync and os.path.exists(db_config):
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      # Close the file handles explicitly (the old code leaked them):
      f = open(db_config, 'r')
      contents = f.readlines()
      f.close()
      # NOTE(review): raises ValueError if the expected line is absent
      # from DB_CONFIG -- preserved from the original behavior.
      index = contents.index(no_sync)
      contents[index] = '# ' + no_sync
      f = open(db_config, 'w')
      f.writelines(contents)
      f.close()


class ExistingRepositoryOutputOption(RepositoryOutputOption):
  """Output the result of the conversion into an existing SVN repository."""

  def __init__(self, target, author_transforms=None):
    RepositoryOutputOption.__init__(self, target, author_transforms)

  def check(self):
    RepositoryOutputOption.check(self)
    if not os.path.isdir(self.target):
      raise FatalError("the svn-repos-path '%s' is not an "
                       "existing directory." % self.target)
+ + Subclasses must implement all of the methods below. + + For each method, a subclass implements, in its own way, the + Subversion operation implied by the method's name. For example, for + the add_path method, the DumpfileDelegate would write out a + 'Node-add:' command to a Subversion dumpfile, the StdoutDelegate + would merely print that the path is being added to the repository, + and the RepositoryDelegate would actually cause the path to be added + to the Subversion repository that it is creating.""" + + def start_commit(self, revnum, revprops): + """An SVN commit is starting. + + Perform any actions needed to start an SVN commit with revision + number REVNUM and revision properties REVPROPS.""" + + raise NotImplementedError() + + def end_commit(self): + """An SVN commit is ending.""" + + raise NotImplementedError() + + def initialize_project(self, project): + """Initialize PROJECT. + + For Subversion, this means to create the trunk, branches, and tags + directories for PROJECT.""" + + raise NotImplementedError() + + def initialize_lod(self, lod): + """Initialize LOD with no contents. + + LOD is an instance of LineOfDevelopment. It is also possible for + an LOD to be created by copying from another LOD; such events are + indicated via the copy_lod() callback.""" + + raise NotImplementedError() + + def mkdir(self, lod, cvs_directory): + """Create CVS_DIRECTORY within LOD. + + LOD is a LineOfDevelopment; CVS_DIRECTORY is a CVSDirectory.""" + + raise NotImplementedError() + + def add_path(self, s_item): + """Add the path corresponding to S_ITEM to the repository. + + S_ITEM is an SVNCommitItem.""" + + raise NotImplementedError() + + def change_path(self, s_item): + """Change the path corresponding to S_ITEM in the repository. + + S_ITEM is an SVNCommitItem.""" + + raise NotImplementedError() + + def delete_lod(self, lod): + """Delete LOD from the repository. 
+ + LOD is a LineOfDevelopment instance.""" + + raise NotImplementedError() + + def delete_path(self, lod, cvs_path): + """Delete CVS_PATH from LOD. + + LOD is a LineOfDevelopment; CVS_PATH is a CVSPath.""" + + raise NotImplementedError() + + def copy_lod(self, src_lod, dest_lod, src_revnum): + """Copy SRC_LOD in SRC_REVNUM to DEST_LOD. + + SRC_LOD and DEST_LOD are both LODs, and SRC_REVNUM is a subversion + revision number (int).""" + + raise NotImplementedError() + + def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum): + """Copy CVS_PATH in SRC_LOD@SRC_REVNUM to DEST_LOD. + + CVS_PATH is a CVSPath, SRC_LOD and DEST_LOD are LODs, and + SRC_REVNUM is a subversion revision number (int).""" + + raise NotImplementedError() + + def finish(self): + """All SVN revisions have been committed. + + Perform any necessary cleanup.""" + + raise NotImplementedError() + + diff --git a/cvs2svn_lib/svn_revision_range.py b/cvs2svn_lib/svn_revision_range.py new file mode 100644 index 0000000..04ba7fa --- /dev/null +++ b/cvs2svn_lib/svn_revision_range.py @@ -0,0 +1,171 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
# ====================================================================

"""This module contains the SVNRevisionRange class."""


import bisect

from cvs2svn_lib.common import SVN_INVALID_REVNUM


class SVNRevisionRange:
  """The range of subversion revision numbers from which a path can be
  copied.

  self.opening_revnum is the number of the earliest such revision, and
  self.closing_revnum is one higher than the number of the last such
  revision.  If self.closing_revnum is None, then no closings were
  registered."""

  def __init__(self, source_lod, opening_revnum):
    # The LOD from which the copy would be made:
    self.source_lod = source_lod
    # The first revnum from which the copy is possible:
    self.opening_revnum = opening_revnum
    # One more than the last revnum from which the copy is possible,
    # or None if no closing has been registered:
    self.closing_revnum = None

  def add_closing(self, closing_revnum):
    """Register that the range closes at CLOSING_REVNUM.

    When we have a non-trunk default branch, we may have multiple
    closings--only register the first closing we encounter."""

    if self.closing_revnum is None:
      self.closing_revnum = closing_revnum

  def __contains__(self, revnum):
    """Return True iff REVNUM is contained in the range."""

    return (
        self.opening_revnum <= revnum
        and (self.closing_revnum is None or revnum < self.closing_revnum)
        )

  def __str__(self):
    if self.closing_revnum is None:
      return '[%d:]' % (self.opening_revnum,)
    else:
      return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)

  def __repr__(self):
    return str(self)


class RevisionScores:
  """Represent the scores for a range of revisions."""

  def __init__(self, svn_revision_ranges):
    """Initialize based on SVN_REVISION_RANGES.

    SVN_REVISION_RANGES is a list of SVNRevisionRange objects.

    The score of an svn source is defined to be the number of
    SVNRevisionRanges on that LOD that include the revision.  A score
    thus indicates that copying the corresponding revision (or any
    following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    that are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If SVN_REVISION_RANGES is empty, then all scores are undefined."""

    # A map {source_lod : [(revnum, +1 or -1), ...]} recording where
    # ranges open (+1) and close (-1):
    deltas_map = {}

    # NOTE: the loop variable is deliberately not called 'range' (as
    # in the original), to avoid shadowing the builtin; and
    # dict.setdefault replaces the original bare 'except:' clause,
    # which would also have swallowed KeyboardInterrupt/SystemExit and
    # hidden genuine programming errors.
    for rev_range in svn_revision_ranges:
      deltas = deltas_map.setdefault(rev_range.source_lod, [])
      deltas.append((rev_range.opening_revnum, +1))
      if rev_range.closing_revnum is not None:
        deltas.append((rev_range.closing_revnum, -1))

    # A map:
    #
    #   {SOURCE_LOD : [(REV1 SCORE1), (REV2 SCORE2), (REV3 SCORE3), ...]}
    #
    # where the tuples are sorted by revision number and the revision
    # numbers are distinct.  Score is the number of correct paths that
    # would result from using the specified SOURCE_LOD and revision
    # number (or any other revision preceding the next revision
    # listed) as a source.  For example, the score of any revision REV
    # in the range REV2 <= REV < REV3 is equal to SCORE2.
    self._scores_map = {}

    for (source_lod, deltas) in deltas_map.items():
      # Sort by revision number:
      deltas.sort()

      # Initialize output list with zeroth element of deltas.  This
      # element must exist, because it was verified that
      # svn_revision_ranges (and therefore openings) is not empty.
      scores = [ deltas[0] ]
      total = deltas[0][1]
      for (rev, change) in deltas[1:]:
        total += change
        if rev == scores[-1][0]:
          # Same revision as last entry; modify last entry:
          scores[-1] = (rev, total)
        else:
          # Previously-unseen revision; create new entry:
          scores.append((rev, total))
      self._scores_map[source_lod] = scores

  def get_score(self, range):
    """Return the score for RANGE's opening revision.

    If RANGE doesn't appear explicitly in self.scores, use the score
    of the highest revision preceding RANGE.  If there are no
    preceding revisions, then the score for RANGE is unknown; in this
    case, return -1."""

    try:
      scores = self._scores_map[range.source_lod]
    except KeyError:
      return -1

    # Remember, according to the tuple sorting rules,
    #
    #    (revnum, anything,) < (revnum+1,) < (revnum+1, anything,)
    predecessor_index = bisect.bisect_right(
        scores, (range.opening_revnum + 1,)
        ) - 1

    if predecessor_index < 0:
      # There are no revisions preceding RANGE's opening revision:
      return -1

    return scores[predecessor_index][1]

  def get_best_revnum(self):
    """Find the revnum with the highest score.

    Return a triple (source_lod, revnum, score) for the revnum with
    the highest score.  (The original docstring claimed a pair was
    returned, but the method has always returned three values.)  If
    the highest score is shared by multiple revisions, select the
    smallest source LOD and, within it, the oldest revision: LODs are
    scanned in sorted order and the strict '>' comparison below makes
    earlier candidates win ties."""

    best_source_lod = None
    best_revnum = SVN_INVALID_REVNUM
    best_score = 0

    for source_lod in sorted(self._scores_map):
      for revnum, score in self._scores_map[source_lod]:
        if score > best_score:
          best_source_lod = source_lod
          best_score = score
          best_revnum = revnum
    return best_source_lod, best_revnum, best_score


# ---- original patch metadata (next file in the diff) ----
# diff --git a/cvs2svn_lib/svn_run_options.py b/cvs2svn_lib/svn_run_options.py
# new file mode 100644
# index 0000000..e757730
# --- /dev/null
# +++ b/cvs2svn_lib/svn_run_options.py
# @@ -0,0 +1,543 @@

# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2009 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
+# ==================================================================== + +"""This module manages cvs2svn run options.""" + + +import sys +import optparse +import datetime +import codecs + +from cvs2svn_lib.version import VERSION +from cvs2svn_lib import config +from cvs2svn_lib.common import warning_prefix +from cvs2svn_lib.common import error_prefix +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import normalize_svn_path +from cvs2svn_lib.log import Log +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.run_options import not_both +from cvs2svn_lib.run_options import RunOptions +from cvs2svn_lib.run_options import ContextOption +from cvs2svn_lib.run_options import IncompatibleOption +from cvs2svn_lib.run_options import authors +from cvs2svn_lib.man_writer import ManWriter +from cvs2svn_lib.project import Project +from cvs2svn_lib.svn_output_option import DumpfileOutputOption +from cvs2svn_lib.svn_output_option import ExistingRepositoryOutputOption +from cvs2svn_lib.svn_output_option import NewRepositoryOutputOption +from cvs2svn_lib.revision_manager import NullRevisionRecorder +from cvs2svn_lib.revision_manager import NullRevisionExcluder +from cvs2svn_lib.rcs_revision_manager import RCSRevisionReader +from cvs2svn_lib.cvs_revision_manager import CVSRevisionReader +from cvs2svn_lib.checkout_internal import InternalRevisionRecorder +from cvs2svn_lib.checkout_internal import InternalRevisionExcluder +from cvs2svn_lib.checkout_internal import InternalRevisionReader +from cvs2svn_lib.symbol_strategy import TrunkPathRule +from cvs2svn_lib.symbol_strategy import BranchesPathRule +from cvs2svn_lib.symbol_strategy import TagsPathRule + + +short_desc = 'convert a cvs repository into a subversion repository' + +synopsis = """\ +.B cvs2svn +[\\fIOPTION\\fR]... \\fIOUTPUT-OPTION CVS-REPOS-PATH\\fR +.br +.B cvs2svn +[\\fIOPTION\\fR]... 
\\fI--options=PATH\\fR +""" + +long_desc = """\ +Create a new Subversion repository based on the version history stored in a +CVS repository. Each CVS commit will be mirrored in the Subversion +repository, including such information as date of commit and id of the +committer. +.P +\\fICVS-REPOS-PATH\\fR is the filesystem path of the part of the CVS +repository that you want to convert. It is not possible to convert a +CVS repository to which you only have remote access; see the FAQ for +more information. This path doesn't have to be the top level +directory of a CVS repository; it can point at a project within a +repository, in which case only that project will be converted. This +path or one of its parent directories has to contain a subdirectory +called CVSROOT (though the CVSROOT directory can be empty). +.P +Multiple CVS repositories can be converted into a single Subversion +repository in a single run of cvs2svn, but only by using an +\\fB--options\\fR file. +""" + +files = """\ +A directory called \\fIcvs2svn-tmp\\fR (or the directory specified by +\\fB--tmpdir\\fR) is used as scratch space for temporary data files. +""" + +see_also = [ + ('cvs', '1'), + ('svn', '1'), + ('svnadmin', '1'), + ] + + +class SVNRunOptions(RunOptions): + def _get_output_options_group(self): + group = RunOptions._get_output_options_group(self) + + group.add_option(IncompatibleOption( + '--svnrepos', '-s', type='string', + action='store', + help='path where SVN repos should be created', + man_help=( + 'Write the output of the conversion into a Subversion repository ' + 'located at \\fIpath\\fR. This option causes a new Subversion ' + 'repository to be created at \\fIpath\\fR unless the ' + '\\fB--existing-svnrepos\\fR option is also used.' 
+ ), + metavar='PATH', + )) + self.parser.set_default('existing_svnrepos', False) + group.add_option(IncompatibleOption( + '--existing-svnrepos', + action='store_true', + help='load into existing SVN repository (for use with --svnrepos)', + man_help=( + 'Load the converted CVS repository into an existing Subversion ' + 'repository, instead of creating a new repository. (This option ' + 'should be used in combination with ' + '\\fB-s\\fR/\\fB--svnrepos\\fR.) The repository must either be ' + 'empty or contain no paths that overlap with those that will ' + 'result from the conversion. Please note that you need write ' + 'permission for the repository files.' + ), + )) + group.add_option(IncompatibleOption( + '--fs-type', type='string', + action='store', + help=( + 'pass --fs-type=TYPE to "svnadmin create" (for use with ' + '--svnrepos)' + ), + man_help=( + 'Pass \\fI--fs-type\\fR=\\fItype\\fR to "svnadmin create" when ' + 'creating a new repository.' + ), + metavar='TYPE', + )) + self.parser.set_default('bdb_txn_nosync', False) + group.add_option(IncompatibleOption( + '--bdb-txn-nosync', + action='store_true', + help=( + 'pass --bdb-txn-nosync to "svnadmin create" (for use with ' + '--svnrepos)' + ), + man_help=( + 'Pass \\fI--bdb-txn-nosync\\fR to "svnadmin create" when ' + 'creating a new BDB-style Subversion repository.' + ), + )) + self.parser.set_default('create_options', []) + group.add_option(IncompatibleOption( + '--create-option', type='string', + action='append', dest='create_options', + help='pass OPT to "svnadmin create" (for use with --svnrepos)', + man_help=( + 'Pass \\fIopt\\fR to "svnadmin create" when creating a new ' + 'Subversion repository (can be specified multiple times to ' + 'pass multiple options).' 
+ ), + metavar='OPT', + )) + group.add_option(IncompatibleOption( + '--dumpfile', type='string', + action='store', + help='just produce a dumpfile; don\'t commit to a repos', + man_help=( + 'Just produce a dumpfile; don\'t commit to an SVN repository. ' + 'Write the dumpfile to \\fIpath\\fR.' + ), + metavar='PATH', + )) + + group.add_option(ContextOption( + '--dry-run', + action='store_true', + help=( + 'do not create a repository or a dumpfile; just print what ' + 'would happen.' + ), + man_help=( + 'Do not create a repository or a dumpfile; just print the ' + 'details of what cvs2svn would do if it were really converting ' + 'your repository.' + ), + )) + + # Deprecated options: + self.parser.set_default('dump_only', False) + group.add_option(IncompatibleOption( + '--dump-only', + action='callback', callback=self.callback_dump_only, + help=optparse.SUPPRESS_HELP, + man_help=optparse.SUPPRESS_HELP, + )) + group.add_option(IncompatibleOption( + '--create', + action='callback', callback=self.callback_create, + help=optparse.SUPPRESS_HELP, + man_help=optparse.SUPPRESS_HELP, + )) + + return group + + def _get_conversion_options_group(self): + group = RunOptions._get_conversion_options_group(self) + + self.parser.set_default('trunk_base', config.DEFAULT_TRUNK_BASE) + group.add_option(IncompatibleOption( + '--trunk', type='string', + action='store', dest='trunk_base', + help=( + 'path for trunk (default: %s)' + % (config.DEFAULT_TRUNK_BASE,) + ), + man_help=( + 'Set the top-level path to use for trunk in the Subversion ' + 'repository. The default is \\fI%s\\fR.' 
+ % (config.DEFAULT_TRUNK_BASE,) + ), + metavar='PATH', + )) + self.parser.set_default('branches_base', config.DEFAULT_BRANCHES_BASE) + group.add_option(IncompatibleOption( + '--branches', type='string', + action='store', dest='branches_base', + help=( + 'path for branches (default: %s)' + % (config.DEFAULT_BRANCHES_BASE,) + ), + man_help=( + 'Set the top-level path to use for branches in the Subversion ' + 'repository. The default is \\fI%s\\fR.' + % (config.DEFAULT_BRANCHES_BASE,) + ), + metavar='PATH', + )) + self.parser.set_default('tags_base', config.DEFAULT_TAGS_BASE) + group.add_option(IncompatibleOption( + '--tags', type='string', + action='store', dest='tags_base', + help=( + 'path for tags (default: %s)' + % (config.DEFAULT_TAGS_BASE,) + ), + man_help=( + 'Set the top-level path to use for tags in the Subversion ' + 'repository. The default is \\fI%s\\fR.' + % (config.DEFAULT_TAGS_BASE,) + ), + metavar='PATH', + )) + group.add_option(ContextOption( + '--no-prune', + action='store_false', dest='prune', + help='don\'t prune empty directories', + man_help=( + 'When all files are deleted from a directory in the Subversion ' + 'repository, don\'t delete the empty directory (the default is ' + 'to delete any empty directories).' + ), + )) + group.add_option(ContextOption( + '--no-cross-branch-commits', + action='store_false', dest='cross_branch_commits', + help='prevent the creation of cross-branch commits', + man_help=( + 'Prevent the creation of commits that affect files on multiple ' + 'branches at once.' + ), + )) + + return group + + def _get_extraction_options_group(self): + group = RunOptions._get_extraction_options_group(self) + + self.parser.set_default('use_internal_co', False) + group.add_option(IncompatibleOption( + '--use-internal-co', + action='store_true', + help=( + 'use internal code to extract revision contents ' + '(fastest but disk space intensive) (default)' + ), + man_help=( + 'Use internal code to extract revision contents. 
This ' + 'is up to 50% faster than using \\fB--use-rcs\\fR, but needs ' + 'a lot of disk space: roughly the size of your CVS repository ' + 'plus the peak size of a complete checkout of the repository ' + 'with all branches that existed and still had commits pending ' + 'at a given time. This option is the default.' + ), + )) + self.parser.set_default('use_cvs', False) + group.add_option(IncompatibleOption( + '--use-cvs', + action='store_true', + help=( + 'use CVS to extract revision contents (slower than ' + '--use-internal-co or --use-rcs)' + ), + man_help=( + 'Use CVS to extract revision contents. This option is slower ' + 'than \\fB--use-internal-co\\fR or \\fB--use-rcs\\fR.' + ), + )) + self.parser.set_default('use_rcs', False) + group.add_option(IncompatibleOption( + '--use-rcs', + action='store_true', + help=( + 'use RCS to extract revision contents (faster than ' + '--use-cvs but fails in some cases)' + ), + man_help=( + 'Use RCS \'co\' to extract revision contents. This option is ' + 'faster than \\fB--use-cvs\\fR but fails in some cases.' + ), + )) + + return group + + def _get_environment_options_group(self): + group = RunOptions._get_environment_options_group(self) + + group.add_option(ContextOption( + '--svnadmin', type='string', + action='store', dest='svnadmin_executable', + help='path to the "svnadmin" program', + man_help=( + 'Path to the \\fIsvnadmin\\fR program. 
(\\fIsvnadmin\\fR is ' + 'needed when the \\fB-s\\fR/\\fB--svnrepos\\fR output option is ' + 'used.)' + ), + metavar='PATH', + )) + + return group + + def callback_dump_only(self, option, opt_str, value, parser): + parser.values.dump_only = True + Log().error( + warning_prefix + + ': The --dump-only option is deprecated (it is implied ' + 'by --dumpfile).\n' + ) + + def callback_create(self, option, opt_str, value, parser): + Log().error( + warning_prefix + + ': The behaviour produced by the --create option is now the ' + 'default;\n' + 'passing the option is deprecated.\n' + ) + + def callback_manpage(self, option, opt_str, value, parser): + f = codecs.getwriter('utf_8')(sys.stdout) + ManWriter( + parser, + section='1', + date=datetime.date.today(), + source='Version %s' % (VERSION,), + manual='User Commands', + short_desc=short_desc, + synopsis=synopsis, + long_desc=long_desc, + files=files, + authors=authors, + see_also=see_also, + ).write_manpage(f) + sys.exit(0) + + def process_extraction_options(self): + """Process options related to extracting data from the CVS repository.""" + + ctx = Ctx() + options = self.options + + not_both(options.use_rcs, '--use-rcs', + options.use_cvs, '--use-cvs') + + not_both(options.use_rcs, '--use-rcs', + options.use_internal_co, '--use-internal-co') + + not_both(options.use_cvs, '--use-cvs', + options.use_internal_co, '--use-internal-co') + + if options.use_rcs: + ctx.revision_recorder = NullRevisionRecorder() + ctx.revision_excluder = NullRevisionExcluder() + ctx.revision_reader = RCSRevisionReader(options.co_executable) + elif options.use_cvs: + ctx.revision_recorder = NullRevisionRecorder() + ctx.revision_excluder = NullRevisionExcluder() + ctx.revision_reader = CVSRevisionReader(options.cvs_executable) + else: + # --use-internal-co is the default: + ctx.revision_recorder = InternalRevisionRecorder(compress=True) + ctx.revision_excluder = InternalRevisionExcluder() + ctx.revision_reader = InternalRevisionReader(compress=True) 
+ + def process_output_options(self): + """Process the options related to SVN output.""" + + ctx = Ctx() + options = self.options + + if options.dump_only and not options.dumpfile: + raise FatalError("'--dump-only' requires '--dumpfile' to be specified.") + + if not options.svnrepos and not options.dumpfile and not ctx.dry_run: + raise FatalError("must pass one of '-s' or '--dumpfile'.") + + not_both(options.svnrepos, '-s', + options.dumpfile, '--dumpfile') + + not_both(options.dumpfile, '--dumpfile', + options.existing_svnrepos, '--existing-svnrepos') + + not_both(options.bdb_txn_nosync, '--bdb-txn-nosync', + options.existing_svnrepos, '--existing-svnrepos') + + not_both(options.dumpfile, '--dumpfile', + options.bdb_txn_nosync, '--bdb-txn-nosync') + + not_both(options.fs_type, '--fs-type', + options.existing_svnrepos, '--existing-svnrepos') + + if ( + options.fs_type + and options.fs_type != 'bdb' + and options.bdb_txn_nosync + ): + raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s." + % options.fs_type) + + if options.svnrepos: + if options.existing_svnrepos: + ctx.output_option = ExistingRepositoryOutputOption(options.svnrepos) + else: + ctx.output_option = NewRepositoryOutputOption( + options.svnrepos, + fs_type=options.fs_type, bdb_txn_nosync=options.bdb_txn_nosync, + create_options=options.create_options) + else: + ctx.output_option = DumpfileOutputOption(options.dumpfile) + + def add_project( + self, + project_cvs_repos_path, + trunk_path=None, branches_path=None, tags_path=None, + initial_directories=[], + symbol_transforms=None, + symbol_strategy_rules=[], + ): + """Add a project to be converted. + + Most arguments are passed straight through to the Project + constructor. 
SYMBOL_STRATEGY_RULES is an iterable of + SymbolStrategyRules that will be applied to symbols in this + project.""" + + if trunk_path is not None: + trunk_path = normalize_svn_path(trunk_path, allow_empty=True) + if branches_path is not None: + branches_path = normalize_svn_path(branches_path, allow_empty=False) + if tags_path is not None: + tags_path = normalize_svn_path(tags_path, allow_empty=False) + + initial_directories = [ + path + for path in [trunk_path, branches_path, tags_path] + if path + ] + [ + normalize_svn_path(path) + for path in initial_directories + ] + + symbol_strategy_rules = list(symbol_strategy_rules) + + # Add rules to set the SVN paths for LODs depending on whether + # they are the trunk, tags, or branches: + if trunk_path is not None: + symbol_strategy_rules.append(TrunkPathRule(trunk_path)) + if branches_path is not None: + symbol_strategy_rules.append(BranchesPathRule(branches_path)) + if tags_path is not None: + symbol_strategy_rules.append(TagsPathRule(tags_path)) + + id = len(self.projects) + project = Project( + id, + project_cvs_repos_path, + initial_directories=initial_directories, + symbol_transforms=symbol_transforms, + ) + + self.projects.append(project) + self.project_symbol_strategy_rules.append(symbol_strategy_rules) + + def clear_projects(self): + """Clear the list of projects to be converted. + + This method is for the convenience of options files, which may + want to import one another.""" + + del self.projects[:] + del self.project_symbol_strategy_rules[:] + + def process_options(self): + # Consistency check for options and arguments. 
+ if len(self.args) == 0: + self.usage() + sys.exit(1) + + if len(self.args) > 1: + Log().error(error_prefix + ": must pass only one CVS repository.\n") + self.usage() + sys.exit(1) + + cvsroot = self.args[0] + + self.process_extraction_options() + self.process_output_options() + self.process_symbol_strategy_options() + self.process_property_setter_options() + + # Create the default project (using ctx.trunk, ctx.branches, and + # ctx.tags): + self.add_project( + cvsroot, + trunk_path=self.options.trunk_base, + branches_path=self.options.branches_base, + tags_path=self.options.tags_base, + symbol_transforms=self.options.symbol_transforms, + symbol_strategy_rules=self.options.symbol_strategy_rules, + ) + + diff --git a/cvs2svn_lib/symbol.py b/cvs2svn_lib/symbol.py new file mode 100644 index 0000000..e3a6b35 --- /dev/null +++ b/cvs2svn_lib/symbol.py @@ -0,0 +1,246 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains classes that represent trunk, branches, and tags. + +The classes in this module represent several concepts related to +symbols and lines of development in the abstract; that is, not within +a particular file, but across all files in a project. 
+ +The classes in this module are organized into the following class +hierarchy: + +AbstractSymbol + | + +--LineOfDevelopment + | | + | +--Trunk + | | + | +--IncludedSymbol (also inherits from TypedSymbol) + | | + | +--Branch + | | + | +--Tag + | + +--Symbol + | + +--TypedSymbol + | + +--IncludedSymbol (also inherits from LineOfDevelopment) + | | + | +--Branch + | | + | +--Tag + | + +--ExcludedSymbol + +Please note the use of multiple inheritance. + +All AbstractSymbols contain an id that is globally unique across all +AbstractSymbols. Moreover, the id of an AbstractSymbol remains the +same even if the symbol is mutated (as described below), and two +AbstractSymbols are considered equal iff their ids are the same, even +if the two instances have different types. Symbols in different +projects always have different ids and are therefore always distinct. +(Indeed, this is pretty much the defining characteristic of a +project.) Even if, for example, two projects each have branches with +the same name, the Symbols representing the branches are distinct and +have distinct ids. (This is important to avoid having to rewrite +databases with new symbol ids in CollateSymbolsPass.) + +AbstractSymbols are all initially created in CollectRevsPass as either +Trunk or Symbol instances. A Symbol instance is essentially an +undifferentiated Symbol. + +In CollateSymbolsPass, it is decided which symbols will be converted +as branches, which as tags, and which excluded altogether. At the +beginning of this pass, the symbols are all represented by instances +of the non-specific Symbol class. During CollateSymbolsPass, each +Symbol instance is replaced by an instance of Branch, Tag, or +ExcludedSymbol with the same id. (Trunk instances are left +unchanged.) At the end of CollateSymbolsPass, all ExcludedSymbols are +discarded and processing continues with only Trunk, Branch, and Tag +instances. 
These three classes inherit from LineOfDevelopment; +therefore, in later passes the term LineOfDevelopment (abbreviated to +LOD) is used to refer to such objects.""" + + +from cvs2svn_lib.context import Ctx +from cvs2svn_lib.common import path_join + + +class AbstractSymbol: + """Base class for all other classes in this file.""" + + def __init__(self, id, project): + self.id = id + self.project = project + + def __hash__(self): + return self.id + + def __eq__(self, other): + return self.id == other.id + + +class LineOfDevelopment(AbstractSymbol): + """Base class for Trunk, Branch, and Tag. + + This is basically the abstraction for what will be a root tree in + the Subversion repository.""" + + def __init__(self, id, project): + AbstractSymbol.__init__(self, id, project) + self.base_path = None + + def get_path(self, *components): + """Return the svn path for this LineOfDevelopment.""" + + return path_join(self.base_path, *components) + + +class Trunk(LineOfDevelopment): + """Represent the main line of development.""" + + def __getstate__(self): + return (self.id, self.project.id, self.base_path,) + + def __setstate__(self, state): + (self.id, project_id, self.base_path,) = state + self.project = Ctx()._projects[project_id] + + def __cmp__(self, other): + if isinstance(other, Trunk): + return cmp(self.project, other.project) + elif isinstance(other, Symbol): + # Allow Trunk to compare less than Symbols: + return -1 + else: + raise NotImplementedError() + + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return 'Trunk' + + def __repr__(self): + return '%s<%x>' % (self, self.id,) + + +class Symbol(AbstractSymbol): + """Represents a symbol within one project in the CVS repository. + + Instance of the Symbol class itself are used to represent symbols + from the CVS repository. CVS, of course, distinguishes between + normal tags and branch tags, but we allow symbol types to be changed + in CollateSymbolsPass. 
Therefore, we store all CVS symbols as + Symbol instances at the beginning of the conversion. + + In CollateSymbolsPass, Symbols are replaced by Branches, Tags, and + ExcludedSymbols (the latter being discarded at the end of that + pass).""" + + def __init__(self, id, project, name, preferred_parent_id=None): + AbstractSymbol.__init__(self, id, project) + self.name = name + + # If this symbol has a preferred parent, this member is the id of + # the LineOfDevelopment instance representing it. If the symbol + # never appeared in a CVSTag or CVSBranch (for example, because + # all of the branches on this LOD have been detached from the + # dependency tree), then this field is set to None. This field is + # set during FilterSymbolsPass. + self.preferred_parent_id = preferred_parent_id + + def __getstate__(self): + return (self.id, self.project.id, self.name, self.preferred_parent_id,) + + def __setstate__(self, state): + (self.id, project_id, self.name, self.preferred_parent_id,) = state + self.project = Ctx()._projects[project_id] + + def __cmp__(self, other): + if isinstance(other, Symbol): + return cmp(self.project, other.project) \ + or cmp(self.name, other.name) \ + or cmp(self.id, other.id) + elif isinstance(other, Trunk): + # Allow Symbols to compare greater than Trunk: + return +1 + else: + raise NotImplementedError() + + def __str__(self): + return self.name + + def __repr__(self): + return '%s<%x>' % (self, self.id,) + + +class TypedSymbol(Symbol): + """A Symbol whose type (branch, tag, or excluded) has been decided.""" + + def __init__(self, symbol): + Symbol.__init__( + self, symbol.id, symbol.project, symbol.name, + symbol.preferred_parent_id, + ) + + +class IncludedSymbol(TypedSymbol, LineOfDevelopment): + """A TypedSymbol that will be included in the conversion.""" + + def __init__(self, symbol): + TypedSymbol.__init__(self, symbol) + # We can't call the LineOfDevelopment constructor, so initialize + # its extra member explicitly: + try: + # If the old 
symbol had a base_path set, then use it: + self.base_path = symbol.base_path + except AttributeError: + self.base_path = None + + def __getstate__(self): + return (TypedSymbol.__getstate__(self), self.base_path,) + + def __setstate__(self, state): + (super_state, self.base_path,) = state + TypedSymbol.__setstate__(self, super_state) + + +class Branch(IncludedSymbol): + """An object that describes a CVS branch.""" + + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return 'Branch(%r)' % (self.name,) + + +class Tag(IncludedSymbol): + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return 'Tag(%r)' % (self.name,) + + +class ExcludedSymbol(TypedSymbol): + def __str__(self): + """For convenience only. The format is subject to change at any time.""" + + return 'ExcludedSymbol(%r)' % (self.name,) + + diff --git a/cvs2svn_lib/symbol_database.py b/cvs2svn_lib/symbol_database.py new file mode 100644 index 0000000..824f97b --- /dev/null +++ b/cvs2svn_lib/symbol_database.py @@ -0,0 +1,68 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
# ====================================================================

"""This module contains the SymbolDatabase class."""


import cPickle

from cvs2svn_lib import config
from cvs2svn_lib.artifact_manager import artifact_manager


class SymbolDatabase:
  """Read-only access to symbol database.

  This class allows iteration and lookups id -> symbol, where symbol
  is a TypedSymbol instance.  The whole database is read into memory
  upon construction."""

  def __init__(self):
    # A map { id : TypedSymbol }
    self._symbols = {}

    f = open(artifact_manager.get_temp_file(config.SYMBOL_DB), 'rb')
    try:
      symbols = cPickle.load(f)
    finally:
      # Close the file even if unpickling raises (the original leaked
      # the file handle in that case).
      f.close()
    for symbol in symbols:
      self._symbols[symbol.id] = symbol

  def get_symbol(self, id):
    """Return the symbol instance with id ID.

    Raise KeyError if the symbol is not known."""

    return self._symbols[id]

  def __iter__(self):
    """Iterate over the Symbol instances within this database."""

    return self._symbols.itervalues()

  def close(self):
    """Release the in-memory symbol map.

    The instance must not be used after this is called."""

    self._symbols = None


def create_symbol_database(symbols):
  """Create and fill a symbol database.

  Record each symbol that is listed in SYMBOLS, which is an iterable
  containing Trunk and TypedSymbol objects."""

  f = open(artifact_manager.get_temp_file(config.SYMBOL_DB), 'wb')
  try:
    # -1 selects the highest available pickle protocol:
    cPickle.dump(symbols, f, -1)
  finally:
    # Guarantee the handle is closed even if pickling raises:
    f.close()


# ---- original patch metadata (next file in the diff) ----
# diff --git a/cvs2svn_lib/symbol_statistics.py b/cvs2svn_lib/symbol_statistics.py
# new file mode 100644
# index 0000000..0d35a50
# --- /dev/null
# +++ b/cvs2svn_lib/symbol_statistics.py
# @@ -0,0 +1,521 @@

# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2008 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
+# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module gathers and processes statistics about lines of development.""" + +import cPickle + +from cvs2svn_lib import config +from cvs2svn_lib.common import error_prefix +from cvs2svn_lib.common import FatalException +from cvs2svn_lib.log import Log +from cvs2svn_lib.artifact_manager import artifact_manager +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.symbol import IncludedSymbol +from cvs2svn_lib.symbol import Branch +from cvs2svn_lib.symbol import Tag +from cvs2svn_lib.symbol import ExcludedSymbol + + +class SymbolPlanError(FatalException): + pass + + +class SymbolPlanException(SymbolPlanError): + def __init__(self, stats, symbol, msg): + self.stats = stats + self.symbol = symbol + SymbolPlanError.__init__( + self, + 'Cannot convert the following symbol to %s: %s\n %s' + % (symbol, msg, self.stats,) + ) + + +class IndeterminateSymbolException(SymbolPlanException): + def __init__(self, stats, symbol): + SymbolPlanException.__init__(self, stats, symbol, 'Indeterminate type') + + +class _Stats: + """A summary of information about a symbol (tag or branch). + + Members: + + lod -- the LineOfDevelopment instance of the lod being described + + tag_create_count -- the number of files in which this lod appears + as a tag + + branch_create_count -- the number of files in which this lod + appears as a branch + + branch_commit_count -- the number of files in which there were + commits on this lod + + trivial_import_count -- the number of files in which this branch + was purely a non-trunk default branch containing exactly one + revision. 
+ + pure_ntdb_count -- the number of files in which this branch was + purely a non-trunk default branch (consisting only of + non-trunk default branch revisions). + + branch_blockers -- a set of Symbol instances for any symbols that + sprout from a branch with this name. + + possible_parents -- a map {LineOfDevelopment : count} indicating + in how many files each LOD could have served as the parent of + self.lod.""" + + def __init__(self, lod): + self.lod = lod + self.tag_create_count = 0 + self.branch_create_count = 0 + self.branch_commit_count = 0 + self.branch_blockers = set() + self.trivial_import_count = 0 + self.pure_ntdb_count = 0 + self.possible_parents = { } + + def register_tag_creation(self): + """Register the creation of this lod as a tag.""" + + self.tag_create_count += 1 + + def register_branch_creation(self): + """Register the creation of this lod as a branch.""" + + self.branch_create_count += 1 + + def register_branch_commit(self): + """Register that there were commit(s) on this branch in one file.""" + + self.branch_commit_count += 1 + + def register_branch_blocker(self, blocker): + """Register BLOCKER as preventing this symbol from being deleted. + + BLOCKER is a tag or a branch that springs from a revision on this + symbol.""" + + self.branch_blockers.add(blocker) + + def register_trivial_import(self): + """Register that this branch is a trivial import branch in one file.""" + + self.trivial_import_count += 1 + + def register_pure_ntdb(self): + """Register that this branch is a pure import branch in one file.""" + + self.pure_ntdb_count += 1 + + def register_possible_parent(self, lod): + """Register that LOD was a possible parent for SELF.lod in a file.""" + + self.possible_parents[lod] = self.possible_parents.get(lod, 0) + 1 + + def register_branch_possible_parents(self, cvs_branch, cvs_file_items): + """Register any possible parents of this symbol from CVS_BRANCH.""" + + # This routine is a bottleneck. 
So we define some local variables + # to speed up access to frequently-needed variables. + register = self.register_possible_parent + parent_cvs_rev = cvs_file_items[cvs_branch.source_id] + + # The "obvious" parent of a branch is the branch holding the + # revision where the branch is rooted: + register(parent_cvs_rev.lod) + + # Any other branches that are rooted at the same revision and + # were committed earlier than the branch are also possible + # parents: + symbol = cvs_branch.symbol + for branch_id in parent_cvs_rev.branch_ids: + parent_symbol = cvs_file_items[branch_id].symbol + # A branch cannot be its own parent, nor can a branch's + # parent be a branch that was created after it. So we stop + # iterating when we reached the branch whose parents we are + # collecting: + if parent_symbol == symbol: + break + register(parent_symbol) + + def register_tag_possible_parents(self, cvs_tag, cvs_file_items): + """Register any possible parents of this symbol from CVS_TAG.""" + + # This routine is a bottleneck. So use local variables to speed + # up access to frequently-needed objects. + register = self.register_possible_parent + parent_cvs_rev = cvs_file_items[cvs_tag.source_id] + + # The "obvious" parent of a tag is the branch holding the + # revision where the branch is rooted: + register(parent_cvs_rev.lod) + + # Branches that are rooted at the same revision are also + # possible parents: + for branch_id in parent_cvs_rev.branch_ids: + parent_symbol = cvs_file_items[branch_id].symbol + register(parent_symbol) + + def is_ghost(self): + """Return True iff this lod never really existed.""" + + return ( + not isinstance(self.lod, Trunk) + and self.branch_commit_count == 0 + and not self.branch_blockers + and not self.possible_parents + ) + + def check_valid(self, symbol): + """Check whether SYMBOL is a valid conversion of SELF.lod. + + It is planned to convert SELF.lod as SYMBOL. 
Verify that SYMBOL + is a TypedSymbol and that the information that it contains is + consistent with that stored in SELF.lod. (This routine does not + do higher-level tests of whether the chosen conversion is actually + sensible.) If there are any problems, raise a + SymbolPlanException.""" + + if not isinstance(symbol, (Trunk, Branch, Tag, ExcludedSymbol)): + raise IndeterminateSymbolException(self, symbol) + + if symbol.id != self.lod.id: + raise SymbolPlanException(self, symbol, 'IDs must match') + + if symbol.project != self.lod.project: + raise SymbolPlanException(self, symbol, 'Projects must match') + + if isinstance(symbol, IncludedSymbol) and symbol.name != self.lod.name: + raise SymbolPlanException(self, symbol, 'Names must match') + + def check_preferred_parent_allowed(self, symbol): + """Check that SYMBOL's preferred_parent_id is an allowed parent. + + SYMBOL is the planned conversion of SELF.lod. Verify that its + preferred_parent_id is a possible parent of SELF.lod. If not, + raise a SymbolPlanException describing the problem.""" + + if isinstance(symbol, IncludedSymbol) \ + and symbol.preferred_parent_id is not None: + for pp in self.possible_parents.keys(): + if pp.id == symbol.preferred_parent_id: + return + else: + raise SymbolPlanException( + self, symbol, + 'The selected parent is not among the symbol\'s ' + 'possible parents.' 
+ ) + + def __str__(self): + return ( + '\'%s\' is ' + 'a tag in %d files, ' + 'a branch in %d files, ' + 'a trivial import in %d files, ' + 'a pure import in %d files, ' + 'and has commits in %d files' + % (self.lod, self.tag_create_count, self.branch_create_count, + self.trivial_import_count, self.pure_ntdb_count, + self.branch_commit_count) + ) + + def __repr__(self): + retval = ['%s\n possible parents:\n' % (self,)] + parent_counts = self.possible_parents.items() + parent_counts.sort(lambda a,b: - cmp(a[1], b[1])) + for (symbol, count) in parent_counts: + if isinstance(symbol, Trunk): + retval.append(' trunk : %d\n' % count) + else: + retval.append(' \'%s\' : %d\n' % (symbol.name, count)) + if self.branch_blockers: + blockers = list(self.branch_blockers) + blockers.sort() + retval.append(' blockers:\n') + for blocker in blockers: + retval.append(' \'%s\'\n' % (blocker,)) + return ''.join(retval) + + +class SymbolStatisticsCollector: + """Collect statistics about lines of development. + + Record a summary of information about each line of development in + the RCS files for later storage into a database. The database is + created in CollectRevsPass and it is used in CollateSymbolsPass (via + the SymbolStatistics class). + + collect_data._SymbolDataCollector inserts information into instances + of this class by by calling its register_*() methods. + + Its main purpose is to assist in the decisions about which symbols + can be treated as branches and tags and which may be excluded. + + The data collected by this class can be written to the file + config.SYMBOL_STATISTICS.""" + + def __init__(self): + # A map { lod -> _Stats } for all lines of development: + self._stats = { } + + def __getitem__(self, lod): + """Return the _Stats record for line of development LOD. 
+ + Create and register a new one if necessary.""" + + try: + return self._stats[lod] + except KeyError: + stats = _Stats(lod) + self._stats[lod] = stats + return stats + + def register(self, cvs_file_items): + """Register the statistics for each symbol in CVS_FILE_ITEMS.""" + + for lod_items in cvs_file_items.iter_lods(): + if lod_items.lod is not None: + branch_stats = self[lod_items.lod] + + branch_stats.register_branch_creation() + + if lod_items.cvs_revisions: + branch_stats.register_branch_commit() + + if lod_items.is_trivial_import(): + branch_stats.register_trivial_import() + + if lod_items.is_pure_ntdb(): + branch_stats.register_pure_ntdb() + + for cvs_symbol in lod_items.iter_blockers(): + branch_stats.register_branch_blocker(cvs_symbol.symbol) + + if lod_items.cvs_branch is not None: + branch_stats.register_branch_possible_parents( + lod_items.cvs_branch, cvs_file_items + ) + + for cvs_tag in lod_items.cvs_tags: + tag_stats = self[cvs_tag.symbol] + + tag_stats.register_tag_creation() + + tag_stats.register_tag_possible_parents(cvs_tag, cvs_file_items) + + def purge_ghost_symbols(self): + """Purge any symbols that don't have any activity. + + Such ghost symbols can arise if a symbol was defined in an RCS + file but pointed at a non-existent revision.""" + + for stats in self._stats.values(): + if stats.is_ghost(): + Log().warn('Deleting ghost symbol: %s' % (stats.lod,)) + del self._stats[stats.lod] + + def close(self): + """Store the stats database to the SYMBOL_STATISTICS file.""" + + f = open(artifact_manager.get_temp_file(config.SYMBOL_STATISTICS), 'wb') + cPickle.dump(self._stats.values(), f, -1) + f.close() + self._stats = None + + +class SymbolStatistics: + """Read and handle line of development statistics. + + The statistics are read from a database created by + SymbolStatisticsCollector. This class has methods to process the + statistics information and help with decisions about: + + 1. What tags and branches should be processed/excluded + + 2. 
What tags should be forced to be branches and vice versa (this + class maintains some statistics to help the user decide) + + 3. Are there inconsistencies? + + - A symbol that is sometimes a branch and sometimes a tag + + - A forced branch with commit(s) on it + + - A non-excluded branch depends on an excluded branch + + The data in this class is read from a pickle file.""" + + def __init__(self, filename): + """Read the stats database from FILENAME.""" + + # A map { LineOfDevelopment -> _Stats } for all lines of + # development: + self._stats = { } + + # A map { LineOfDevelopment.id -> _Stats } for all lines of + # development: + self._stats_by_id = { } + + stats_list = cPickle.load(open(filename, 'rb')) + + for stats in stats_list: + self._stats[stats.lod] = stats + self._stats_by_id[stats.lod.id] = stats + + def __len__(self): + return len(self._stats) + + def __getitem__(self, lod_id): + return self._stats_by_id[lod_id] + + def get_stats(self, lod): + """Return the _Stats object for LineOfDevelopment instance LOD. + + Raise KeyError if no such lod exists.""" + + return self._stats[lod] + + def __iter__(self): + return self._stats.itervalues() + + def _check_blocked_excludes(self, symbol_map): + """Check for any excluded LODs that are blocked by non-excluded symbols. 
+ + If any are found, describe the problem to Log().error() and raise + a FatalException.""" + + # A list of (lod,[blocker,...]) tuples for excludes that are + # blocked by the specified non-excluded blockers: + problems = [] + + for lod in symbol_map.itervalues(): + if isinstance(lod, ExcludedSymbol): + # Symbol is excluded; make sure that its blockers are also + # excluded: + lod_blockers = [] + for blocker in self.get_stats(lod).branch_blockers: + if isinstance(symbol_map.get(blocker, None), IncludedSymbol): + lod_blockers.append(blocker) + if lod_blockers: + problems.append((lod, lod_blockers)) + + if problems: + s = [] + for (lod, lod_blockers) in problems: + s.append( + '%s: %s cannot be excluded because the following symbols ' + 'depend on it:\n' + % (error_prefix, lod,) + ) + for blocker in lod_blockers: + s.append(' %s\n' % (blocker,)) + s.append('\n') + Log().error(''.join(s)) + + raise FatalException() + + def _check_invalid_tags(self, symbol_map): + """Check for commits on any symbols that are to be converted as tags. + + SYMBOL_MAP is a map {AbstractSymbol : (Trunk|TypedSymbol)} + indicating how each AbstractSymbol is to be converted. If there + is a commit on a symbol, then it cannot be converted as a tag. 
If + any tags with commits are found, output error messages describing + the problems then raise a FatalException.""" + + Log().quiet("Checking for forced tags with commits...") + + invalid_tags = [ ] + for symbol in symbol_map.itervalues(): + if isinstance(symbol, Tag): + stats = self.get_stats(symbol) + if stats.branch_commit_count > 0: + invalid_tags.append(symbol) + + if not invalid_tags: + # No problems found: + return + + s = [] + s.append( + '%s: The following branches cannot be forced to be tags ' + 'because they have commits:\n' + % (error_prefix,) + ) + for tag in invalid_tags: + s.append(' %s\n' % (tag.name)) + s.append('\n') + Log().error(''.join(s)) + + raise FatalException() + + def check_consistency(self, symbol_map): + """Check the plan for how to convert symbols for consistency. + + SYMBOL_MAP is a map {AbstractSymbol : (Trunk|TypedSymbol)} + indicating how each AbstractSymbol is to be converted. If any + problems are detected, describe the problem to Log().error() and + raise a FatalException.""" + + # We want to do all of the consistency checks even if one of them + # fails, so that the user gets as much feedback as possible. Set + # this variable to True if any errors are found. 
+ error_found = False + + # Check that the planned preferred parents are OK for all + # IncludedSymbols: + for lod in symbol_map.itervalues(): + if isinstance(lod, IncludedSymbol): + stats = self.get_stats(lod) + try: + stats.check_preferred_parent_allowed(lod) + except SymbolPlanException, e: + Log().error('%s\n' % (e,)) + error_found = True + + try: + self._check_blocked_excludes(symbol_map) + except FatalException: + error_found = True + + try: + self._check_invalid_tags(symbol_map) + except FatalException: + error_found = True + + if error_found: + raise FatalException( + 'Please fix the above errors and restart CollateSymbolsPass' + ) + + def exclude_symbol(self, symbol): + """SYMBOL has been excluded; remove it from our statistics.""" + + del self._stats[symbol] + del self._stats_by_id[symbol.id] + + # Remove references to this symbol from other statistics objects: + for stats in self._stats.itervalues(): + stats.branch_blockers.discard(symbol) + if symbol in stats.possible_parents: + del stats.possible_parents[symbol] + + diff --git a/cvs2svn_lib/symbol_strategy.py b/cvs2svn_lib/symbol_strategy.py new file mode 100644 index 0000000..9d562a8 --- /dev/null +++ b/cvs2svn_lib/symbol_strategy.py @@ -0,0 +1,685 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2000-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""SymbolStrategy classes determine how to convert symbols.""" + +import re + +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import path_join +from cvs2svn_lib.common import normalize_svn_path +from cvs2svn_lib.log import Log +from cvs2svn_lib.symbol import Trunk +from cvs2svn_lib.symbol import TypedSymbol +from cvs2svn_lib.symbol import Branch +from cvs2svn_lib.symbol import Tag +from cvs2svn_lib.symbol import ExcludedSymbol +from cvs2svn_lib.symbol_statistics import SymbolPlanError + + +class StrategyRule: + """A single rule that might determine how to convert a symbol.""" + + def start(self, symbol_statistics): + """This method is called once before get_symbol() is ever called. + + The StrategyRule can override this method to do whatever it wants + to prepare itself for work. SYMBOL_STATISTICS is an instance of + SymbolStatistics containing the statistics for all symbols in all + projects.""" + + pass + + def get_symbol(self, symbol, stats): + """Return an object describing what to do with the symbol in STATS. + + SYMBOL holds a Trunk or Symbol object as it has been determined so + far. Hopefully one of these method calls will turn any naked + Symbol instances into TypedSymbols. + + If this rule applies to the SYMBOL (whose statistics are collected + in STATS), then return a new or modified AbstractSymbol object. + If this rule doesn't apply, return SYMBOL unchanged.""" + + raise NotImplementedError() + + def finish(self): + """This method is called once after get_symbol() is done being called. + + The StrategyRule can override this method do whatever it wants to + release resources, etc.""" + + pass + + +class _RegexpStrategyRule(StrategyRule): + """A Strategy rule that bases its decisions on regexp matches. 
+ + If self.regexp matches a symbol name, return self.action(symbol); + otherwise, return the symbol unchanged.""" + + def __init__(self, pattern, action): + """Initialize a _RegexpStrategyRule. + + PATTERN is a string that will be treated as a regexp pattern. + PATTERN must match a full symbol name for the rule to apply (i.e., + it is anchored at the beginning and end of the symbol name). + + ACTION is the class representing how the symbol should be + converted. It should be one of the classes Branch, Tag, or + ExcludedSymbol. + + If PATTERN matches a symbol name, then get_symbol() returns + ACTION(name, id); otherwise it returns SYMBOL unchanged.""" + + try: + self.regexp = re.compile('^' + pattern + '$') + except re.error: + raise FatalError("%r is not a valid regexp." % (pattern,)) + + self.action = action + + def log(self, symbol): + raise NotImplementedError() + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + elif self.regexp.match(symbol.name): + self.log(symbol) + return self.action(symbol) + else: + return symbol + + +class ForceBranchRegexpStrategyRule(_RegexpStrategyRule): + """Force symbols matching pattern to be branches.""" + + def __init__(self, pattern): + _RegexpStrategyRule.__init__(self, pattern, Branch) + + def log(self, symbol): + Log().verbose( + 'Converting symbol %s as a branch because it matches regexp "%s".' + % (symbol, self.regexp.pattern,) + ) + + +class ForceTagRegexpStrategyRule(_RegexpStrategyRule): + """Force symbols matching pattern to be tags.""" + + def __init__(self, pattern): + _RegexpStrategyRule.__init__(self, pattern, Tag) + + def log(self, symbol): + Log().verbose( + 'Converting symbol %s as a tag because it matches regexp "%s".' 
+ % (symbol, self.regexp.pattern,) + ) + + +class ExcludeRegexpStrategyRule(_RegexpStrategyRule): + """Exclude symbols matching pattern.""" + + def __init__(self, pattern): + _RegexpStrategyRule.__init__(self, pattern, ExcludedSymbol) + + def log(self, symbol): + Log().verbose( + 'Excluding symbol %s because it matches regexp "%s".' + % (symbol, self.regexp.pattern,) + ) + + +class ExcludeTrivialImportBranchRule(StrategyRule): + """If a symbol is a trivial import branch, exclude it. + + A trivial import branch is defined to be a branch that only had a + single import on it (no other kinds of commits) in every file in + which it appeared. In most cases these branches are worthless.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + if stats.tag_create_count == 0 \ + and stats.branch_create_count == stats.trivial_import_count: + Log().verbose( + 'Excluding branch %s because it is a trivial import branch.' + % (symbol,) + ) + return ExcludedSymbol(symbol) + else: + return symbol + + +class ExcludeVendorBranchRule(StrategyRule): + """If a symbol is a pure vendor branch, exclude it. + + A pure vendor branch is defined to be a branch that only had imports + on it (no other kinds of commits) in every file in which it + appeared.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + if stats.tag_create_count == 0 \ + and stats.branch_create_count == stats.pure_ntdb_count: + Log().verbose( + 'Excluding branch %s because it is a pure vendor branch.' 
+ % (symbol,) + ) + return ExcludedSymbol(symbol) + else: + return symbol + + +class UnambiguousUsageRule(StrategyRule): + """If a symbol is used unambiguously as a tag/branch, convert it as such.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + is_tag = stats.tag_create_count > 0 + is_branch = stats.branch_create_count > 0 or stats.branch_commit_count > 0 + if is_tag and is_branch: + # Can't decide + return symbol + elif is_branch: + Log().verbose( + 'Converting symbol %s as a branch because it is always used ' + 'as a branch.' + % (symbol,) + ) + return Branch(symbol) + elif is_tag: + Log().verbose( + 'Converting symbol %s as a tag because it is always used ' + 'as a tag.' + % (symbol,) + ) + return Tag(symbol) + else: + # The symbol didn't appear at all: + return symbol + + +class BranchIfCommitsRule(StrategyRule): + """If there was ever a commit on the symbol, convert it as a branch.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + elif stats.branch_commit_count > 0: + Log().verbose( + 'Converting symbol %s as a branch because there are commits on it.' + % (symbol,) + ) + return Branch(symbol) + else: + return symbol + + +class HeuristicStrategyRule(StrategyRule): + """Convert symbol based on how often it was used as a branch/tag. + + Whichever happened more often determines how the symbol is + converted.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + elif stats.tag_create_count >= stats.branch_create_count: + Log().verbose( + 'Converting symbol %s as a tag because it is more often used ' + 'as a tag.' + % (symbol,) + ) + return Tag(symbol) + else: + Log().verbose( + 'Converting symbol %s as a branch because it is more often used ' + 'as a branch.' + % (symbol,) + ) + return Branch(symbol) + + +class AllBranchRule(StrategyRule): + """Convert all symbols as branches. 
+ + Usually this rule will appear after a list of more careful rules + (including a general rule like UnambiguousUsageRule) and will + therefore only apply to the symbols not handled earlier.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + else: + Log().verbose( + 'Converting symbol %s as a branch because no other rules applied.' + % (symbol,) + ) + return Branch(symbol) + + +class AllTagRule(StrategyRule): + """Convert all symbols as tags. + + We don't worry about conflicts here; they will be caught later by + SymbolStatistics.check_consistency(). + + Usually this rule will appear after a list of more careful rules + (including a general rule like UnambiguousUsageRule) and will + therefore only apply to the symbols not handled earlier.""" + + def get_symbol(self, symbol, stats): + if isinstance(symbol, (Trunk, TypedSymbol)): + return symbol + else: + Log().verbose( + 'Converting symbol %s as a tag because no other rules applied.' 
+ % (symbol,) + ) + return Tag(symbol) + + +class TrunkPathRule(StrategyRule): + """Set the base path for Trunk.""" + + def __init__(self, trunk_path): + self.trunk_path = trunk_path + + def get_symbol(self, symbol, stats): + if isinstance(symbol, Trunk) and symbol.base_path is None: + symbol.base_path = self.trunk_path + + return symbol + + +class SymbolPathRule(StrategyRule): + """Set the base paths for symbol LODs.""" + + def __init__(self, symbol_type, base_path): + self.symbol_type = symbol_type + self.base_path = base_path + + def get_symbol(self, symbol, stats): + if isinstance(symbol, self.symbol_type) and symbol.base_path is None: + symbol.base_path = path_join(self.base_path, symbol.name) + + return symbol + + +class BranchesPathRule(SymbolPathRule): + """Set the base paths for Branch LODs.""" + + def __init__(self, branch_path): + SymbolPathRule.__init__(self, Branch, branch_path) + + +class TagsPathRule(SymbolPathRule): + """Set the base paths for Tag LODs.""" + + def __init__(self, tag_path): + SymbolPathRule.__init__(self, Tag, tag_path) + + +class HeuristicPreferredParentRule(StrategyRule): + """Use a heuristic rule to pick preferred parents. + + Pick the parent that should be preferred for any TypedSymbols. As + parent, use the symbol that appeared most often as a possible parent + of the symbol in question. If multiple symbols are tied, choose the + one that comes first according to the Symbol class's natural sort + order.""" + + def _get_preferred_parent(self, stats): + """Return the LODs that are most often possible parents in STATS. + + Return the set of LinesOfDevelopment that appeared most often as + possible parents. 
The return value might contain multiple symbols + if multiple LinesOfDevelopment appeared the same number of times.""" + + best_count = -1 + best_symbol = None + for (symbol, count) in stats.possible_parents.items(): + if count > best_count or (count == best_count and symbol < best_symbol): + best_count = count + best_symbol = symbol + + if best_symbol is None: + return None + else: + return best_symbol + + def get_symbol(self, symbol, stats): + if isinstance(symbol, TypedSymbol) and symbol.preferred_parent_id is None: + preferred_parent = self._get_preferred_parent(stats) + if preferred_parent is None: + Log().verbose('%s has no preferred parent' % (symbol,)) + else: + symbol.preferred_parent_id = preferred_parent.id + Log().verbose( + 'The preferred parent of %s is %s' % (symbol, preferred_parent,) + ) + + return symbol + + +class ManualTrunkRule(StrategyRule): + """Change the SVN path of Trunk LODs. + + Members: + + project_id -- (int or None) The id of the project whose trunk + should be affected by this rule. If project_id is None, then + the rule is not project-specific. + + svn_path -- (str) The SVN path that should be used as the base + directory for this trunk. This member must not be None, + though it may be the empty string for a single-project, + trunk-only conversion. + + """ + + def __init__(self, project_id, svn_path): + self.project_id = project_id + self.svn_path = normalize_svn_path(svn_path, allow_empty=True) + + def get_symbol(self, symbol, stats): + if (self.project_id is not None + and self.project_id != stats.lod.project.id): + return symbol + + if isinstance(symbol, Trunk): + symbol.base_path = self.svn_path + + return symbol + + +def convert_as_branch(symbol): + Log().verbose( + 'Converting symbol %s as a branch because of manual setting.' + % (symbol,) + ) + return Branch(symbol) + + +def convert_as_tag(symbol): + Log().verbose( + 'Converting symbol %s as a tag because of manual setting.' 
+ % (symbol,) + ) + return Tag(symbol) + + +def exclude(symbol): + Log().verbose( + 'Excluding symbol %s because of manual setting.' + % (symbol,) + ) + return ExcludedSymbol(symbol) + + +class ManualSymbolRule(StrategyRule): + """Change how particular symbols are converted. + + Members: + + project_id -- (int or None) The id of the project whose trunk + should be affected by this rule. If project_id is None, then + the rule is not project-specific. + + symbol_name -- (str) The name of the symbol that should be + affected by this rule. + + conversion -- (callable or None) A callable that converts the + symbol to its preferred output type. This should normally be + one of (convert_as_branch, convert_as_tag, exclude). If this + member is None, then this rule does not affect the symbol's + output type. + + svn_path -- (str) The SVN path that should be used as the base + directory for this trunk. This member must not be None, + though it may be the empty string for a single-project, + trunk-only conversion. + + parent_lod_name -- (str or None) The name of the line of + development that should be preferred as the parent of this + symbol. (The preferred parent is the line of development from + which the symbol should sprout.) If this member is set to the + string '.trunk.', then the symbol will be set to sprout + directly from trunk. If this member is set to None, then this + rule won't affect the symbol's parent. + + """ + + def __init__( + self, project_id, symbol_name, conversion, svn_path, parent_lod_name + ): + self.project_id = project_id + self.symbol_name = symbol_name + self.conversion = conversion + if svn_path is None: + self.svn_path = None + else: + self.svn_path = normalize_svn_path(svn_path, allow_empty=True) + self.parent_lod_name = parent_lod_name + + def _get_parent_by_id(self, parent_lod_name, stats): + """Return the LOD object for the parent with name PARENT_LOD_NAME. 
+ + STATS is the _Stats object describing a symbol whose parent needs + to be determined from its name. If none of its possible parents + has name PARENT_LOD_NAME, raise a SymbolPlanError.""" + + for pp in stats.possible_parents.keys(): + if isinstance(pp, Trunk): + pass + elif pp.name == parent_lod_name: + return pp + else: + parent_counts = stats.possible_parents.items() + parent_counts.sort(lambda a,b: - cmp(a[1], b[1])) + lines = [ + '%s is not a valid parent for %s;' + % (parent_lod_name, stats.lod,), + ' possible parents (with counts):' + ] + for (symbol, count) in parent_counts: + if isinstance(symbol, Trunk): + lines.append(' .trunk. : %d' % count) + else: + lines.append(' %s : %d' % (symbol.name, count)) + raise SymbolPlanError('\n'.join(lines)) + + def get_symbol(self, symbol, stats): + if (self.project_id is not None + and self.project_id != stats.lod.project.id): + return symbol + + elif isinstance(symbol, Trunk): + return symbol + + elif self.symbol_name == stats.lod.name: + if self.conversion is not None: + symbol = self.conversion(symbol) + + if self.parent_lod_name is None: + pass + elif self.parent_lod_name == '.trunk.': + symbol.preferred_parent_id = stats.lod.project.trunk_id + else: + symbol.preferred_parent_id = self._get_parent_by_id( + self.parent_lod_name, stats + ).id + + if self.svn_path is not None: + symbol.base_path = self.svn_path + + return symbol + + +class SymbolHintsFileRule(StrategyRule): + """Use manual symbol configurations read from a file. + + The input file is line-oriented with the following format: + + [ []] + + Where the fields are separated by whitespace and + + project-id -- the numerical id of the Project to which the + symbol belongs (numbered starting with 0). This field can + be '.' if the rule is not project-specific. + + symbol-name -- the name of the symbol being specified, or + '.trunk.' if the rule should apply to trunk. + + conversion -- how the symbol should be treated in the + conversion. 
This is one of the following values: 'branch', + 'tag', or 'exclude'. This field can be '.' if the rule + shouldn't affect how the symbol is treated in the + conversion. + + svn-path -- the SVN path that should serve as the root path of + this LOD. The path should be expressed as a path relative + to the SVN root directory, with or without a leading '/'. + This field can be omitted or '.' if the rule shouldn't + affect the LOD's SVN path. + + parent-lod-name -- the name of the LOD that should serve as this + symbol's parent. This field can be omitted or '.' if the + rule shouldn't affect the symbol's parent, or it can be + '.trunk.' to indicate that the symbol should sprout from the + project's trunk.""" + + comment_re = re.compile(r'^(\#|$)') + + conversion_map = { + 'branch' : convert_as_branch, + 'tag' : convert_as_tag, + 'exclude' : exclude, + '.' : None, + } + + def __init__(self, filename): + self.filename = filename + + def start(self, symbol_statistics): + self._rules = [] + + f = open(self.filename, 'r') + for l in f: + l = l.rstrip() + s = l.lstrip() + if self.comment_re.match(s): + continue + fields = s.split() + + if len(fields) < 3: + raise FatalError( + 'The following line in "%s" cannot be parsed:\n "%s"' + % (self.filename, l,) + ) + + project_id = fields.pop(0) + symbol_name = fields.pop(0) + conversion = fields.pop(0) + + if fields: + svn_path = fields.pop(0) + if svn_path == '.': + svn_path = None + elif svn_path[0] == '/': + svn_path = svn_path[1:] + else: + svn_path = None + + if fields: + parent_lod_name = fields.pop(0) + else: + parent_lod_name = '.' 
+ + if fields: + raise FatalError( + 'The following line in "%s" cannot be parsed:\n "%s"' + % (self.filename, l,) + ) + + if project_id == '.': + project_id = None + else: + try: + project_id = int(project_id) + except ValueError: + raise FatalError( + 'Illegal project_id in the following line:\n "%s"' % (l,) + ) + + if symbol_name == '.trunk.': + if conversion not in ['.', 'trunk']: + raise FatalError('Trunk cannot be converted as a different type') + + if parent_lod_name != '.': + raise FatalError('Trunk\'s parent cannot be set') + + if svn_path is None: + # This rule doesn't do anything: + pass + else: + self._rules.append(ManualTrunkRule(project_id, svn_path)) + + else: + try: + conversion = self.conversion_map[conversion] + except KeyError: + raise FatalError( + 'Illegal conversion in the following line:\n "%s"' % (l,) + ) + + if parent_lod_name == '.': + parent_lod_name = None + + if conversion is None \ + and svn_path is None \ + and parent_lod_name is None: + # There is nothing to be done: + pass + else: + self._rules.append( + ManualSymbolRule( + project_id, symbol_name, + conversion, svn_path, parent_lod_name + ) + ) + + for rule in self._rules: + rule.start(symbol_statistics) + + def get_symbol(self, symbol, stats): + for rule in self._rules: + symbol = rule.get_symbol(symbol, stats) + + return symbol + + def finish(self): + for rule in self._rules: + rule.finish() + + del self._rules + + diff --git a/cvs2svn_lib/symbol_transform.py b/cvs2svn_lib/symbol_transform.py new file mode 100644 index 0000000..a4995b8 --- /dev/null +++ b/cvs2svn_lib/symbol_transform.py @@ -0,0 +1,236 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2006-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. 
+# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains classes to transform symbol names.""" + + +import os +import re + +from cvs2svn_lib.log import Log +from cvs2svn_lib.common import FatalError +from cvs2svn_lib.common import IllegalSVNPathError +from cvs2svn_lib.common import normalize_svn_path + + +class SymbolTransform: + """Transform symbol names arbitrarily.""" + + def transform(self, cvs_file, symbol_name, revision): + """Possibly transform SYMBOL_NAME, which was found in CVS_FILE. + + Return the transformed symbol name. If this SymbolTransform + doesn't apply, return the original SYMBOL_NAME. If this symbol + should be ignored entirely, return None. (Please note that + ignoring a branch via this mechanism only causes the branch *name* + to be ignored; the branch contents will still be converted. + Usually branches should be excluded using --exclude.) + + REVISION contains the CVS revision number to which the symbol was + attached in the file as a string (with zeros removed). + + This method is free to use the information in CVS_FILE (including + CVS_FILE.project) to decide whether and/or how to transform + SYMBOL_NAME.""" + + raise NotImplementedError() + + +class ReplaceSubstringsSymbolTransform(SymbolTransform): + """Replace specific substrings in symbol names. 
+ + If the substring occurs multiple times, replace all copies.""" + + def __init__(self, old, new): + self.old = old + self.new = new + + def transform(self, cvs_file, symbol_name, revision): + return symbol_name.replace(self.old, self.new) + + +class NormalizePathsSymbolTransform(SymbolTransform): + def transform(self, cvs_file, symbol_name, revision): + try: + return normalize_svn_path(symbol_name) + except IllegalSVNPathError, e: + raise FatalError('Problem with %s: %s' % (symbol_name, e,)) + + +class CompoundSymbolTransform(SymbolTransform): + """A SymbolTransform that applies other SymbolTransforms in series. + + Each of the contained SymbolTransforms is applied, one after the + other. If any of them returns None, then None is returned (the + following SymbolTransforms are ignored).""" + + def __init__(self, symbol_transforms): + """Ininitialize a CompoundSymbolTransform. + + SYMBOL_TRANSFORMS is an iterable of SymbolTransform instances.""" + + self.symbol_transforms = list(symbol_transforms) + + def transform(self, cvs_file, symbol_name, revision): + for symbol_transform in self.symbol_transforms: + symbol_name = symbol_transform.transform( + cvs_file, symbol_name, revision + ) + if symbol_name is None: + # Don't continue with other symbol transforms: + break + + return symbol_name + + +class RegexpSymbolTransform(SymbolTransform): + """Transform symbols by using a regexp textual substitution.""" + + def __init__(self, pattern, replacement): + """Create a SymbolTransform that transforms symbols matching PATTERN. + + PATTERN is a regular expression that should match the whole symbol + name. 
REPLACEMENT is the replacement text, which may include + patterns like r'\1' or r'\g<1>' or r'\g' (where 'name' is a + reference to a named substring in the pattern of the form + r'(?P...)').""" + + self.pattern = re.compile('^' + pattern + '$') + self.replacement = replacement + + def transform(self, cvs_file, symbol_name, revision): + return self.pattern.sub(self.replacement, symbol_name) + + +class SymbolMapper(SymbolTransform): + """A SymbolTransform that transforms specific symbol definitions. + + The user has to specify the exact CVS filename, symbol name, and + revision number to be transformed, and the new name (or None if the + symbol should be ignored). The mappings can be set via a + constructor argument or by calling __setitem__().""" + + def __init__(self, items=[]): + """Initialize the mapper. + + ITEMS is a list of tuples (cvs_filename, symbol_name, revision, + new_name) which will be set as mappings.""" + + # A map {(cvs_filename, symbol_name, revision) : new_name}: + self._map = {} + + for (cvs_filename, symbol_name, revision, new_name) in items: + self[cvs_filename, symbol_name, revision] = new_name + + def __setitem__(self, (cvs_filename, symbol_name, revision), new_name): + """Set a mapping for a particular file, symbol, and revision.""" + + cvs_filename = os.path.normcase(os.path.normpath(cvs_filename)) + key = (cvs_filename, symbol_name, revision) + if key in self._map: + Log().warn( + 'Overwriting symbol transform for\n' + ' filename=%r symbol=%s revision=%s' + % (cvs_filename, symbol_name, revision,) + ) + self._map[key] = new_name + + def transform(self, cvs_file, symbol_name, revision): + cvs_filename = os.path.normcase(os.path.normpath(cvs_file.filename)) + return self._map.get( + (cvs_filename, symbol_name, revision), symbol_name + ) + + +class SubtreeSymbolMapper(SymbolTransform): + """A SymbolTransform that transforms symbols within a whole repo subtree. 
+ + The user has to specify a CVS repository path (a filename or + directory) and the original symbol name. All symbols under that + path will be renamed to the specified new name (which can be None if + the symbol should be ignored). The mappings can be set via a + constructor argument or by calling __setitem__(). Only the most + specific rule is applied.""" + + def __init__(self, items=[]): + """Initialize the mapper. + + ITEMS is a list of tuples (cvs_path, symbol_name, new_name) + which will be set as mappings. cvs_path is a string naming a + directory within the CVS repository.""" + + # A map {symbol_name : {cvs_path : new_name}}: + self._map = {} + + for (cvs_path, symbol_name, new_name) in items: + self[cvs_path, symbol_name] = new_name + + def __setitem__(self, (cvs_path, symbol_name), new_name): + """Set a mapping for a particular file and symbol.""" + + try: + symbol_map = self._map[symbol_name] + except KeyError: + symbol_map = {} + self._map[symbol_name] = symbol_map + + cvs_path = os.path.normcase(os.path.normpath(cvs_path)) + if cvs_path in symbol_map: + Log().warn( + 'Overwriting symbol transform for\n' + ' directory=%r symbol=%s' + % (cvs_path, symbol_name,) + ) + symbol_map[cvs_path] = new_name + + def transform(self, cvs_file, symbol_name, revision): + try: + symbol_map = self._map[symbol_name] + except KeyError: + # No rules for that symbol name + return symbol_name + + cvs_path = os.path.normcase(os.path.normpath(cvs_file.filename)) + while True: + try: + return symbol_map[cvs_path] + except KeyError: + new_cvs_path = os.path.dirname(cvs_path) + if new_cvs_path == cvs_path: + # No rules found for that path; return symbol name unaltered. + return symbol_name + else: + cvs_path = new_cvs_path + + +class IgnoreSymbolTransform(SymbolTransform): + """Ignore symbols matching a specified regular expression.""" + + def __init__(self, pattern): + """Create an SymbolTransform that ignores symbols matching PATTERN. 
+ + PATTERN is a regular expression that should match the whole symbol + name.""" + + self.pattern = re.compile('^' + pattern + '$') + + def transform(self, cvs_file, symbol_name, revision): + if self.pattern.match(symbol_name): + return None + else: + return symbol_name + + diff --git a/cvs2svn_lib/time_range.py b/cvs2svn_lib/time_range.py new file mode 100644 index 0000000..f7dc234 --- /dev/null +++ b/cvs2svn_lib/time_range.py @@ -0,0 +1,44 @@ +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2006-2008 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +"""This module contains a class to manage time ranges.""" + + +class TimeRange(object): + __slots__ = ('t_min', 't_max') + + def __init__(self): + # Start out with a t_min higher than any incoming time T, and a + # t_max lower than any incoming T. This way the first T will push + # t_min down to T, and t_max up to T, naturally (without any + # special-casing), and successive times will then ratchet them + # outward as appropriate. + self.t_min = 1L<<32 + self.t_max = 0 + + def add(self, timestamp): + """Expand the range to encompass TIMESTAMP.""" + + if timestamp < self.t_min: + self.t_min = timestamp + if timestamp > self.t_max: + self.t_max = timestamp + + def __cmp__(self, other): + # Sorted by t_max, and break ties using t_min. 
+ return cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min) + + diff --git a/cvs2svn_lib/version.py b/cvs2svn_lib/version.py new file mode 100644 index 0000000..7900964 --- /dev/null +++ b/cvs2svn_lib/version.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python2 +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2007-2009 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. +# ==================================================================== + +# The version of cvs2svn: +VERSION = '2.3.0' + + +# If this file is run as a script, print the cvs2svn version number to +# stdout: +if __name__ == '__main__': + print VERSION + + diff --git a/cvs2svn_rcsparse/__init__.py b/cvs2svn_rcsparse/__init__.py new file mode 100644 index 0000000..829c117 --- /dev/null +++ b/cvs2svn_rcsparse/__init__.py @@ -0,0 +1,26 @@ +# -*-python-*- +# +# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# +# By using this file, you agree to the terms and conditions set forth in +# the LICENSE.html file which can be found at the top level of the ViewVC +# distribution or at http://viewvc.org/license-1.html. 
+# +# For more information, visit http://viewvc.org/ +# +# ----------------------------------------------------------------------- + +"""This package provides parsing tools for RCS files.""" + +from common import * + +try: + from tparse import parse +except ImportError: + try: + from texttools import Parser + except ImportError: + from default import Parser + + def parse(file, sink): + return Parser().parse(file, sink) diff --git a/cvs2svn_rcsparse/common.py b/cvs2svn_rcsparse/common.py new file mode 100644 index 0000000..3eed600 --- /dev/null +++ b/cvs2svn_rcsparse/common.py @@ -0,0 +1,324 @@ +# -*-python-*- +# +# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# +# By using this file, you agree to the terms and conditions set forth in +# the LICENSE.html file which can be found at the top level of the ViewVC +# distribution or at http://viewvc.org/license-1.html. +# +# For more information, visit http://viewvc.org/ +# +# ----------------------------------------------------------------------- + +"""common.py: common classes and functions for the RCS parsing tools.""" + +import calendar +import string + +class Sink: + def set_head_revision(self, revision): + pass + + def set_principal_branch(self, branch_name): + pass + + def set_access(self, accessors): + pass + + def define_tag(self, name, revision): + pass + + def set_locker(self, revision, locker): + pass + + def set_locking(self, mode): + """Used to signal locking mode. 
+ + Called with mode argument 'strict' if strict locking + Not called when no locking used.""" + + pass + + def set_comment(self, comment): + pass + + def set_expansion(self, mode): + pass + + def admin_completed(self): + pass + + def define_revision(self, revision, timestamp, author, state, + branches, next): + pass + + def tree_completed(self): + pass + + def set_description(self, description): + pass + + def set_revision_info(self, revision, log, text): + pass + + def parse_completed(self): + pass + + +# -------------------------------------------------------------------------- +# +# EXCEPTIONS USED BY RCSPARSE +# + +class RCSParseError(Exception): + pass + + +class RCSIllegalCharacter(RCSParseError): + pass + + +class RCSExpected(RCSParseError): + def __init__(self, got, wanted): + RCSParseError.__init__( + self, + 'Unexpected parsing error in RCS file.\n' + 'Expected token: %s, but saw: %s' + % (wanted, got) + ) + + +class RCSStopParser(Exception): + pass + + +# -------------------------------------------------------------------------- +# +# STANDARD TOKEN STREAM-BASED PARSER +# + +class _Parser: + stream_class = None # subclasses need to define this + + def _read_until_semicolon(self): + """Read all tokens up to and including the next semicolon token. + + Return the tokens (not including the semicolon) as a list.""" + + tokens = [] + + while 1: + token = self.ts.get() + if token == ';': + break + tokens.append(token) + + return tokens + + def _parse_admin_head(self, token): + rev = self.ts.get() + if rev == ';': + # The head revision is not specified. Just drop the semicolon + # on the floor. 
+ pass + else: + self.sink.set_head_revision(rev) + self.ts.match(';') + + def _parse_admin_branch(self, token): + branch = self.ts.get() + if branch != ';': + self.sink.set_principal_branch(branch) + self.ts.match(';') + + def _parse_admin_access(self, token): + accessors = self._read_until_semicolon() + if accessors: + self.sink.set_access(accessors) + + def _parse_admin_symbols(self, token): + while 1: + tag_name = self.ts.get() + if tag_name == ';': + break + self.ts.match(':') + tag_rev = self.ts.get() + self.sink.define_tag(tag_name, tag_rev) + + def _parse_admin_locks(self, token): + while 1: + locker = self.ts.get() + if locker == ';': + break + self.ts.match(':') + rev = self.ts.get() + self.sink.set_locker(rev, locker) + + def _parse_admin_strict(self, token): + self.sink.set_locking("strict") + self.ts.match(';') + + def _parse_admin_comment(self, token): + self.sink.set_comment(self.ts.get()) + self.ts.match(';') + + def _parse_admin_expand(self, token): + expand_mode = self.ts.get() + self.sink.set_expansion(expand_mode) + self.ts.match(';') + + admin_token_map = { + 'head' : _parse_admin_head, + 'branch' : _parse_admin_branch, + 'access' : _parse_admin_access, + 'symbols' : _parse_admin_symbols, + 'locks' : _parse_admin_locks, + 'strict' : _parse_admin_strict, + 'comment' : _parse_admin_comment, + 'expand' : _parse_admin_expand, + 'desc' : None, + } + + def parse_rcs_admin(self): + while 1: + # Read initial token at beginning of line + token = self.ts.get() + + try: + f = self.admin_token_map[token] + except KeyError: + # We're done once we reach the description of the RCS tree + if token[0] in string.digits: + self.ts.unget(token) + return + else: + # Chew up "newphrase" + # warn("Unexpected RCS token: $token\n") + pass + else: + if f is None: + self.ts.unget(token) + return + else: + f(self, token) + + def _parse_rcs_tree_entry(self, revision): + # Parse date + self.ts.match('date') + date = self.ts.get() + self.ts.match(';') + + # Convert date into 
timestamp + date_fields = string.split(date, '.') + # According to rcsfile(5): the year "contains just the last two + # digits of the year for years from 1900 through 1999, and all the + # digits of years thereafter". + if len(date_fields[0]) == 2: + date_fields[0] = '19' + date_fields[0] + date_fields = map(string.atoi, date_fields) + EPOCH = 1970 + if date_fields[0] < EPOCH: + raise ValueError, 'invalid year' + timestamp = calendar.timegm(tuple(date_fields) + (0, 0, 0,)) + + # Parse author + ### NOTE: authors containing whitespace are violations of the + ### RCS specification. We are making an allowance here because + ### CVSNT is known to produce these sorts of authors. + self.ts.match('author') + author = ' '.join(self._read_until_semicolon()) + + # Parse state + self.ts.match('state') + state = '' + while 1: + token = self.ts.get() + if token == ';': + break + state = state + token + ' ' + state = state[:-1] # toss the trailing space + + # Parse branches + self.ts.match('branches') + branches = self._read_until_semicolon() + + # Parse revision of next delta in chain + self.ts.match('next') + next = self.ts.get() + if next == ';': + next = None + else: + self.ts.match(';') + + # there are some files with extra tags in them. for example: + # owner 640; + # group 15; + # permissions 644; + # hardlinks @configure.in@; + # this is "newphrase" in RCSFILE(5). we just want to skip over these. + while 1: + token = self.ts.get() + if token == 'desc' or token[0] in string.digits: + self.ts.unget(token) + break + # consume everything up to the semicolon + self._read_until_semicolon() + + self.sink.define_revision(revision, timestamp, author, state, branches, + next) + + def parse_rcs_tree(self): + while 1: + revision = self.ts.get() + + # End of RCS tree description ? 
+ if revision == 'desc': + self.ts.unget(revision) + return + + self._parse_rcs_tree_entry(revision) + + def parse_rcs_description(self): + self.ts.match('desc') + self.sink.set_description(self.ts.get()) + + def parse_rcs_deltatext(self): + while 1: + revision = self.ts.get() + if revision is None: + # EOF + break + text, sym2, log, sym1 = self.ts.mget(4) + if sym1 != 'log': + print `text[:100], sym2[:100], log[:100], sym1[:100]` + raise RCSExpected(sym1, 'log') + if sym2 != 'text': + raise RCSExpected(sym2, 'text') + ### need to add code to chew up "newphrase" + self.sink.set_revision_info(revision, log, text) + + def parse(self, file, sink): + self.ts = self.stream_class(file) + self.sink = sink + + self.parse_rcs_admin() + + # let sink know when the admin section has been completed + self.sink.admin_completed() + + self.parse_rcs_tree() + + # many sinks want to know when the tree has been completed so they can + # do some work to prep for the arrival of the deltatext + self.sink.tree_completed() + + self.parse_rcs_description() + self.parse_rcs_deltatext() + + # easiest for us to tell the sink it is done, rather than worry about + # higher level software doing it. + self.sink.parse_completed() + + self.ts = self.sink = None + +# -------------------------------------------------------------------------- diff --git a/cvs2svn_rcsparse/debug.py b/cvs2svn_rcsparse/debug.py new file mode 100644 index 0000000..cfeaf2b --- /dev/null +++ b/cvs2svn_rcsparse/debug.py @@ -0,0 +1,122 @@ +# -*-python-*- +# +# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# +# By using this file, you agree to the terms and conditions set forth in +# the LICENSE.html file which can be found at the top level of the ViewVC +# distribution or at http://viewvc.org/license-1.html. 
+# +# For more information, visit http://viewvc.org/ +# +# ----------------------------------------------------------------------- + +"""debug.py: various debugging tools for the rcsparse package.""" + +import time + +from __init__ import parse +import common + + +class DebugSink(common.Sink): + def set_head_revision(self, revision): + print 'head:', revision + + def set_principal_branch(self, branch_name): + print 'branch:', branch_name + + def define_tag(self, name, revision): + print 'tag:', name, '=', revision + + def set_comment(self, comment): + print 'comment:', comment + + def set_description(self, description): + print 'description:', description + + def define_revision(self, revision, timestamp, author, state, + branches, next): + print 'revision:', revision + print ' timestamp:', timestamp + print ' author:', author + print ' state:', state + print ' branches:', branches + print ' next:', next + + def set_revision_info(self, revision, log, text): + print 'revision:', revision + print ' log:', log + print ' text:', text[:100], '...' + + +class DumpSink(common.Sink): + """Dump all the parse information directly to stdout. + + The output is relatively unformatted and untagged. It is intended as a + raw dump of the data in the RCS file. A copy can be saved, then changes + made to the parsing engine, then a comparison of the new output against + the old output. 
+ """ + def __init__(self): + global sha + import sha + + def set_head_revision(self, revision): + print revision + + def set_principal_branch(self, branch_name): + print branch_name + + def define_tag(self, name, revision): + print name, revision + + def set_comment(self, comment): + print comment + + def set_description(self, description): + print description + + def define_revision(self, revision, timestamp, author, state, + branches, next): + print revision, timestamp, author, state, branches, next + + def set_revision_info(self, revision, log, text): + print revision, sha.new(log).hexdigest(), sha.new(text).hexdigest() + + def tree_completed(self): + print 'tree_completed' + + def parse_completed(self): + print 'parse_completed' + + +def dump_file(fname): + parse(open(fname, 'rb'), DumpSink()) + +def time_file(fname): + f = open(fname, 'rb') + s = common.Sink() + t = time.time() + parse(f, s) + t = time.time() - t + print t + +def _usage(): + print 'This is normally a module for importing, but it has a couple' + print 'features for testing as an executable script.' + print 'USAGE: %s COMMAND filename,v' % sys.argv[0] + print ' where COMMAND is one of:' + print ' dump: filename is "dumped" to stdout' + print ' time: filename is parsed with the time written to stdout' + sys.exit(1) + +if __name__ == '__main__': + import sys + if len(sys.argv) != 3: + _usage() + if sys.argv[1] == 'dump': + dump_file(sys.argv[2]) + elif sys.argv[1] == 'time': + time_file(sys.argv[2]) + else: + _usage() diff --git a/cvs2svn_rcsparse/default.py b/cvs2svn_rcsparse/default.py new file mode 100644 index 0000000..57f9fc6 --- /dev/null +++ b/cvs2svn_rcsparse/default.py @@ -0,0 +1,172 @@ +# -*-python-*- +# +# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# +# By using this file, you agree to the terms and conditions set forth in +# the LICENSE.html file which can be found at the top level of the ViewVC +# distribution or at http://viewvc.org/license-1.html. 
+# +# For more information, visit http://viewvc.org/ +# +# ----------------------------------------------------------------------- +# +# This file was originally based on portions of the blame.py script by +# Curt Hagenlocher. +# +# ----------------------------------------------------------------------- + +import string +import common + +class _TokenStream: + token_term = frozenset(string.whitespace + ';:') + + # the algorithm is about the same speed for any CHUNK_SIZE chosen. + # grab a good-sized chunk, but not too large to overwhelm memory. + # note: we use a multiple of a standard block size + CHUNK_SIZE = 192 * 512 # about 100k + +# CHUNK_SIZE = 5 # for debugging, make the function grind... + + def __init__(self, file): + self.rcsfile = file + self.idx = 0 + self.buf = self.rcsfile.read(self.CHUNK_SIZE) + if self.buf == '': + raise RuntimeError, 'EOF' + + def get(self): + "Get the next token from the RCS file." + + # Note: we can afford to loop within Python, examining individual + # characters. For the whitespace and tokens, the number of iterations + # is typically quite small. Thus, a simple iterative loop will beat + # out more complex solutions. 
+ + buf = self.buf + lbuf = len(buf) + idx = self.idx + + while 1: + if idx == lbuf: + buf = self.rcsfile.read(self.CHUNK_SIZE) + if buf == '': + # signal EOF by returning None as the token + del self.buf # so we fail if get() is called again + return None + lbuf = len(buf) + idx = 0 + + if buf[idx] not in string.whitespace: + break + + idx = idx + 1 + + if buf[idx] in ';:': + self.buf = buf + self.idx = idx + 1 + return buf[idx] + + if buf[idx] != '@': + end = idx + 1 + token = '' + while 1: + # find token characters in the current buffer + while end < lbuf and buf[end] not in self.token_term: + end = end + 1 + token = token + buf[idx:end] + + if end < lbuf: + # we stopped before the end, so we have a full token + idx = end + break + + # we stopped at the end of the buffer, so we may have a partial token + buf = self.rcsfile.read(self.CHUNK_SIZE) + lbuf = len(buf) + idx = end = 0 + + self.buf = buf + self.idx = idx + return token + + # a "string" which starts with the "@" character. we'll skip it when we + # search for content. + idx = idx + 1 + + chunks = [ ] + + while 1: + if idx == lbuf: + idx = 0 + buf = self.rcsfile.read(self.CHUNK_SIZE) + if buf == '': + raise RuntimeError, 'EOF' + lbuf = len(buf) + i = string.find(buf, '@', idx) + if i == -1: + chunks.append(buf[idx:]) + idx = lbuf + continue + if i == lbuf - 1: + chunks.append(buf[idx:i]) + idx = 0 + buf = '@' + self.rcsfile.read(self.CHUNK_SIZE) + if buf == '@': + raise RuntimeError, 'EOF' + lbuf = len(buf) + continue + if buf[i + 1] == '@': + chunks.append(buf[idx:i+1]) + idx = i + 2 + continue + + chunks.append(buf[idx:i]) + + self.buf = buf + self.idx = i + 1 + + return ''.join(chunks) + +# _get = get +# def get(self): + token = self._get() + print 'T:', `token` + return token + + def match(self, match): + "Try to match the next token from the input buffer." 
+ + token = self.get() + if token != match: + raise common.RCSExpected(token, match) + + def unget(self, token): + "Put this token back, for the next get() to return." + + # Override the class' .get method with a function which clears the + # overridden method then returns the pushed token. Since this function + # will not be looked up via the class mechanism, it should be a "normal" + # function, meaning it won't have "self" automatically inserted. + # Therefore, we need to pass both self and the token thru via defaults. + + # note: we don't put this into the input buffer because it may have been + # @-unescaped already. + + def give_it_back(self=self, token=token): + del self.get + return token + + self.get = give_it_back + + def mget(self, count): + "Return multiple tokens. 'next' is at the end." + result = [ ] + for i in range(count): + result.append(self.get()) + result.reverse() + return result + + +class Parser(common._Parser): + stream_class = _TokenStream diff --git a/cvs2svn_rcsparse/parse_rcs_file.py b/cvs2svn_rcsparse/parse_rcs_file.py new file mode 100644 index 0000000..215845d --- /dev/null +++ b/cvs2svn_rcsparse/parse_rcs_file.py @@ -0,0 +1,73 @@ +#!/usr/bin/python2 + +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2006-2007 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://cvs2svn.tigris.org/. 
+# ==================================================================== + +"""Parse an RCS file, showing the rcsparse callbacks that are called. + +This program is useful to see whether an RCS file has a problem (in +the sense of not being parseable by rcsparse) and also to illuminate +the correspondence between RCS file contents and rcsparse callbacks. + +The output of this program can also be considered to be a kind of +'canonical' format for RCS files, at least in so far as rcsparse +returns all relevant information in the file and provided that the +order of callbacks is always the same.""" + + +import sys +import os + + +class Logger: + def __init__(self, f, name): + self.f = f + self.name = name + + def __call__(self, *args): + self.f.write( + '%s(%s)\n' % (self.name, ', '.join(['%r' % arg for arg in args]),) + ) + + +class LoggingSink: + def __init__(self, f): + self.f = f + + def __getattr__(self, name): + return Logger(self.f, name) + + +if __name__ == '__main__': + # Since there is nontrivial logic in __init__.py, we have to import + # parse() via that file. 
First make sure that the directory + # containing this script is in the path: + sys.path.insert(0, os.path.dirname(sys.argv[0])) + + from __init__ import parse + + if sys.argv[1:]: + for path in sys.argv[1:]: + if os.path.isfile(path) and path.endswith(',v'): + parse( + open(path, 'rb'), LoggingSink(sys.stdout) + ) + else: + sys.stderr.write('%r is being ignored.\n' % path) + else: + parse(sys.stdin, LoggingSink(sys.stdout)) + + diff --git a/cvs2svn_rcsparse/rcparse_redundant_work.patch b/cvs2svn_rcsparse/rcparse_redundant_work.patch new file mode 100644 index 0000000..b574dd2 --- /dev/null +++ b/cvs2svn_rcsparse/rcparse_redundant_work.patch @@ -0,0 +1,99 @@ +=== modified file 'cvs2svn_rcsparse/default.py' +--- cvs2svn_rcsparse/default.py 2007-11-18 23:05:32 +0000 ++++ cvs2svn_rcsparse/default.py 2010-01-23 10:21:47 +0000 +@@ -19,7 +19,7 @@ + import common + + class _TokenStream: +- token_term = string.whitespace + ';:' ++ token_term = frozenset(string.whitespace + ';:') + + # the algorithm is about the same speed for any CHUNK_SIZE chosen. + # grab a good-sized chunk, but not too large to overwhelm memory. +@@ -44,15 +44,17 @@ + # out more complex solutions. 
+ + buf = self.buf ++ lbuf = len(buf) + idx = self.idx + + while 1: +- if idx == len(buf): ++ if idx == lbuf: + buf = self.rcsfile.read(self.CHUNK_SIZE) + if buf == '': + # signal EOF by returning None as the token + del self.buf # so we fail if get() is called again + return None ++ lbuf = len(buf) + idx = 0 + + if buf[idx] not in string.whitespace: +@@ -60,7 +62,7 @@ + + idx = idx + 1 + +- if buf[idx] == ';' or buf[idx] == ':': ++ if buf[idx] in ';:': + self.buf = buf + self.idx = idx + 1 + return buf[idx] +@@ -70,17 +72,18 @@ + token = '' + while 1: + # find token characters in the current buffer +- while end < len(buf) and buf[end] not in self.token_term: ++ while end < lbuf and buf[end] not in self.token_term: + end = end + 1 + token = token + buf[idx:end] + +- if end < len(buf): ++ if end < lbuf: + # we stopped before the end, so we have a full token + idx = end + break + + # we stopped at the end of the buffer, so we may have a partial token + buf = self.rcsfile.read(self.CHUNK_SIZE) ++ lbuf = len(buf) + idx = end = 0 + + self.buf = buf +@@ -94,22 +97,24 @@ + chunks = [ ] + + while 1: +- if idx == len(buf): ++ if idx == lbuf: + idx = 0 + buf = self.rcsfile.read(self.CHUNK_SIZE) + if buf == '': + raise RuntimeError, 'EOF' ++ lbuf = len(buf) + i = string.find(buf, '@', idx) + if i == -1: + chunks.append(buf[idx:]) +- idx = len(buf) ++ idx = lbuf + continue +- if i == len(buf) - 1: ++ if i == lbuf - 1: + chunks.append(buf[idx:i]) + idx = 0 + buf = '@' + self.rcsfile.read(self.CHUNK_SIZE) + if buf == '@': + raise RuntimeError, 'EOF' ++ lbuf = len(buf) + continue + if buf[i + 1] == '@': + chunks.append(buf[idx:i+1]) +@@ -121,7 +126,7 @@ + self.buf = buf + self.idx = i + 1 + +- return string.join(chunks, '') ++ return ''.join(chunks) + + # _get = get + # def get(self): + diff --git a/cvs2svn_rcsparse/run-tests.py b/cvs2svn_rcsparse/run-tests.py new file mode 100644 index 0000000..eb9c3ea --- /dev/null +++ b/cvs2svn_rcsparse/run-tests.py @@ -0,0 +1,73 @@ 
+#!/usr/bin/python2 + +# (Be in -*- python -*- mode.) +# +# ==================================================================== +# Copyright (c) 2007 CollabNet. All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://subversion.tigris.org/license-1.html. +# If newer versions of this license are posted there, you may use a +# newer version instead, at your option. +# +# This software consists of voluntary contributions made by many +# individuals. For exact contribution history, see the revision +# history and logs, available at http://viewvc.tigris.org/. +# ==================================================================== + +"""Run tests of rcsparse code.""" + +import sys +import os +import glob +from cStringIO import StringIO +from difflib import Differ + +# Since there is nontrivial logic in __init__.py, we have to import +# parse() via that file. 
First make sure that the directory +# containing this script is in the path: +script_dir = os.path.dirname(sys.argv[0]) +sys.path.insert(0, script_dir) + +from __init__ import parse +from parse_rcs_file import LoggingSink + + +test_dir = os.path.join(script_dir, 'test-data') + +filelist = glob.glob(os.path.join(test_dir, '*,v')) +filelist.sort() + +all_tests_ok = 1 + +for filename in filelist: + sys.stderr.write('%s: ' % (filename,)) + f = StringIO() + try: + parse(open(filename, 'rb'), LoggingSink(f)) + except Exception, e: + sys.stderr.write('Error parsing file: %s!\n' % (e,)) + all_tests_ok = 0 + else: + output = f.getvalue() + + expected_output_filename = filename[:-2] + '.out' + expected_output = open(expected_output_filename, 'rb').read() + + if output == expected_output: + sys.stderr.write('OK\n') + else: + sys.stderr.write('Output does not match expected output!\n') + differ = Differ() + for diffline in differ.compare( + expected_output.splitlines(1), output.splitlines(1) + ): + sys.stderr.write(diffline) + all_tests_ok = 0 + +if all_tests_ok: + sys.exit(0) +else: + sys.exit(1) + diff --git a/cvs2svn_rcsparse/texttools.py b/cvs2svn_rcsparse/texttools.py new file mode 100644 index 0000000..7c713eb --- /dev/null +++ b/cvs2svn_rcsparse/texttools.py @@ -0,0 +1,348 @@ +# -*-python-*- +# +# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# +# By using this file, you agree to the terms and conditions set forth in +# the LICENSE.html file which can be found at the top level of the ViewVC +# distribution or at http://viewvc.org/license-1.html. +# +# For more information, visit http://viewvc.org/ +# +# ----------------------------------------------------------------------- + +import string + +# note: this will raise an ImportError if it isn't available. the rcsparse +# package will recognize this and switch over to the default parser. 
+from mx import TextTools + +import common + + +# for convenience +_tt = TextTools + +_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256)) +_idchar_list.remove('$') +_idchar_list.remove(',') +#_idchar_list.remove('.') # leave as part of 'num' symbol +_idchar_list.remove(':') +_idchar_list.remove(';') +_idchar_list.remove('@') +_idchar = string.join(_idchar_list, '') +_idchar_set = _tt.set(_idchar) + +_onechar_token_set = _tt.set(':;') + +_not_at_set = _tt.invset('@') + +_T_TOKEN = 30 +_T_STRING_START = 40 +_T_STRING_SPAN = 60 +_T_STRING_END = 70 + +_E_COMPLETE = 100 # ended on a complete token +_E_TOKEN = 110 # ended mid-token +_E_STRING_SPAN = 130 # ended within a string +_E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@) + +_SUCCESS = +100 + +_EOF = 'EOF' +_CONTINUE = 'CONTINUE' +_UNUSED = 'UNUSED' + + +# continuation of a token over a chunk boundary +_c_token_table = ( + (_T_TOKEN, _tt.AllInSet, _idchar_set), + ) + +class _mxTokenStream: + + # the algorithm is about the same speed for any CHUNK_SIZE chosen. + # grab a good-sized chunk, but not too large to overwhelm memory. + # note: we use a multiple of a standard block size + CHUNK_SIZE = 192 * 512 # about 100k + +# CHUNK_SIZE = 5 # for debugging, make the function grind... + + def __init__(self, file): + self.rcsfile = file + self.tokens = [ ] + self.partial = None + + self.string_end = None + + def _parse_chunk(self, buf, start=0): + "Get the next token from the RCS file." + + buflen = len(buf) + + assert start < buflen + + # construct a tag table which refers to the buffer we need to parse. + table = ( + #1: ignore whitespace. with or without whitespace, move to the next rule. + (None, _tt.AllInSet, _tt.whitespace_set, +1), + + #2 + (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS), + + #3: accumulate token text and exit, or move to the next rule. 
+ (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2), + + #4 + (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS), + + #5: single character tokens exit immediately, or move to the next rule + (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2), + + #6 + (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS), + + #7: if this isn't an '@' symbol, then we have a syntax error (go to a + # negative index to indicate that condition). otherwise, suck it up + # and move to the next rule. + (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'), + + #8 + (None, _tt.Is, '@', +4, +1), + #9 + (buf, _tt.Is, '@', +1, -1), + #10 + (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1), + #11 + (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS), + + #12 + (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS), + + #13: suck up everything that isn't an AT. go to next rule to look for EOF + (buf, _tt.AllInSet, _not_at_set, 0, +1), + + #14: go back to look for double AT if we aren't at the end of the string + (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS), + ) + + # Fast, texttools may be, but it's somewhat lacking in clarity. + # Here's an attempt to document the logic encoded in the table above: + # + # Flowchart: + # _____ + # / /\ + # 1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11 + # | \/ \/ \/ /\ \/ + # \ 4 6 12 14 / + # \_______/_____/ \ / / + # \ 13 / + # \__________________________________________/ + # + # #1: Skip over any whitespace. + # #2: If now EOF, exit with code _E_COMPLETE. + # #3: If we have a series of characters in _idchar_set, then: + # #4: Output them as a token, and go back to #1. + # #5: If we have a character in _onechar_token_set, then: + # #6: Output it as a token, and go back to #1. + # #7: If we do not have an '@', then error. + # If we do, then log a _T_STRING_START and continue. + # #8: If we have another '@', continue on to #9. 
Otherwise: + # #12: If now EOF, exit with code _E_STRING_SPAN. + # #13: Record the slice up to the next '@' (or EOF). + # #14: If now EOF, exit with code _E_STRING_SPAN. + # Otherwise, go back to #8. + # #9: If we have another '@', then we've just seen an escaped + # (by doubling) '@' within an @-string. Record a slice including + # just one '@' character, and jump back to #8. + # Otherwise, we've *either* seen the terminating '@' of an @-string, + # *or* we've seen one half of an escaped @@ sequence that just + # happened to be split over a chunk boundary - in either case, + # we continue on to #10. + # #10: Log a _T_STRING_END. + # #11: If now EOF, exit with _E_STRING_END. Otherwise, go back to #1. + + success, taglist, idx = _tt.tag(buf, table, start) + + if not success: + ### need a better way to report this error + raise common.RCSIllegalCharacter() + assert idx == buflen + + # pop off the last item + last_which = taglist.pop() + + i = 0 + tlen = len(taglist) + while i < tlen: + if taglist[i] == _T_STRING_START: + j = i + 1 + while j < tlen: + if taglist[j] == _T_STRING_END: + s = _tt.join(taglist, '', i+1, j) + del taglist[i:j] + tlen = len(taglist) + taglist[i] = s + break + j = j + 1 + else: + assert last_which == _E_STRING_SPAN + s = _tt.join(taglist, '', i+1) + del taglist[i:] + self.partial = (_T_STRING_SPAN, [ s ]) + break + i = i + 1 + + # figure out whether we have a partial last-token + if last_which == _E_TOKEN: + self.partial = (_T_TOKEN, [ taglist.pop() ]) + elif last_which == _E_COMPLETE: + pass + elif last_which == _E_STRING_SPAN: + assert self.partial + else: + assert last_which == _E_STRING_END + self.partial = (_T_STRING_END, [ taglist.pop() ]) + + taglist.reverse() + taglist.extend(self.tokens) + self.tokens = taglist + + def _set_end(self, taglist, text, l, r, subtags): + self.string_end = l + + def _handle_partial(self, buf): + which, chunks = self.partial + if which == _T_TOKEN: + success, taglist, idx = _tt.tag(buf, _c_token_table) + if 
not success: + # The start of this buffer was not a token. So the end of the + # prior buffer was a complete token. + self.tokens.insert(0, string.join(chunks, '')) + else: + assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \ + and taglist[0][1] == 0 and taglist[0][2] == idx + if idx == len(buf): + # + # The whole buffer was one huge token, so we may have a + # partial token again. + # + # Note: this modifies the list of chunks in self.partial + # + chunks.append(buf) + + # consumed the whole buffer + return len(buf) + + # got the rest of the token. + chunks.append(buf[:idx]) + self.tokens.insert(0, string.join(chunks, '')) + + # no more partial token + self.partial = None + + return idx + + if which == _T_STRING_END: + if buf[0] != '@': + self.tokens.insert(0, string.join(chunks, '')) + return 0 + chunks.append('@') + start = 1 + else: + start = 0 + + self.string_end = None + string_table = ( + (None, _tt.Is, '@', +3, +1), + (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1), + (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS), + + (None, _tt.EOF, _tt.Here, +1, _SUCCESS), + + # suck up everything that isn't an AT. 
move to next rule to look + # for EOF + (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1), + + # go back to look for double AT if we aren't at the end of the string + (None, _tt.EOF, _tt.Here, -5, _SUCCESS), + ) + + success, unused, idx = _tt.tag(buf, string_table, + start, len(buf), chunks) + + # must have matched at least one item + assert success + + if self.string_end is None: + assert idx == len(buf) + self.partial = (_T_STRING_SPAN, chunks) + elif self.string_end < len(buf): + self.partial = None + self.tokens.insert(0, string.join(chunks, '')) + else: + self.partial = (_T_STRING_END, chunks) + + return idx + + def _parse_more(self): + buf = self.rcsfile.read(self.CHUNK_SIZE) + if not buf: + return _EOF + + if self.partial: + idx = self._handle_partial(buf) + if idx is None: + return _CONTINUE + if idx < len(buf): + self._parse_chunk(buf, idx) + else: + self._parse_chunk(buf) + + return _CONTINUE + + def get(self): + try: + return self.tokens.pop() + except IndexError: + pass + + while not self.tokens: + action = self._parse_more() + if action == _EOF: + return None + + return self.tokens.pop() + + +# _get = get +# def get(self): + token = self._get() + print 'T:', `token` + return token + + def match(self, match): + if self.tokens: + token = self.tokens.pop() + else: + token = self.get() + + if token != match: + raise common.RCSExpected(token, match) + + def unget(self, token): + self.tokens.append(token) + + def mget(self, count): + "Return multiple tokens. 'next' is at the end." + while len(self.tokens) < count: + action = self._parse_more() + if action == _EOF: + ### fix this + raise RuntimeError, 'EOF hit while expecting tokens' + result = self.tokens[-count:] + del self.tokens[-count:] + return result + + +class Parser(common._Parser): + stream_class = _mxTokenStream -- cgit v1.2.3-65-gdbad