""" Tools for converting Cran packages to conda recipes. """ from __future__ import absolute_import, division, print_function import requests import yaml import unicodedata # try to import C dumper try: from yaml import CSafeDumper as SafeDumper except ImportError: from yaml import SafeDumper import re import sys from os import makedirs, listdir from os.path import join, exists, isfile, basename, isdir from itertools import chain import subprocess from difflib import get_close_matches from conda.install import rm_rf from conda import compat from conda_build import source, metadata CRAN_META = """\ {{% set posix = 'm2-' if win else '' %}} {{% set native = 'm2w64-' if win else '' %}} package: name: {packagename} # Note that conda versions cannot contain -, so any -'s in the version have # been replaced with _'s. version: "{conda_version}" source: {fn_key} {filename} {url_key} {cranurl} {git_url_key} {git_url} {git_tag_key} {git_tag} # You can add a hash for the file here, like md5 or sha1 # md5: 49448ba4863157652311cc5ea4fea3ea # sha1: 3bcfbee008276084cbb37a2b453963c61176a322 # patches: # List any patch files here # - fix.patch build: # If this is a new build for the same version, increment the build # number. If you do not include this key, it defaults to 0. # number: 1 # This is required to make R link correctly on Linux. rpaths: - lib/R/lib/ - lib/ {suggests} requirements: build:{build_depends} run:{run_depends} test: commands: # You can put additional test commands to be run here. - $R -e "library('{cran_packagename}')" # [not win] - "\\"%R%\\" -e \\"library('{cran_packagename}')\\"" # [win] # You can also put a file called run_test.py, run_test.sh, or run_test.bat # in the recipe that will be run at test time. # requires: # Put any additional test requirements here. about: {home_comment}home:{homeurl} license: {license} {summary_comment}summary:{summary} license_family: {license_family} # The original CRAN metadata for this package was: {cran_metadata} # See # http://docs.continuum.io/conda/build.html for # more information about meta.yaml """ CRAN_BUILD_SH = """\ #!/bin/bash # R refuses to build packages that mark themselves as Priority: Recommended mv DESCRIPTION DESCRIPTION.old grep -v '^Priority: ' DESCRIPTION.old > DESCRIPTION $R CMD INSTALL --build . # Add more build steps here, if they are necessary. # See # http://docs.continuum.io/conda/build.html # for a list of environment variables that are set during the build process. """ CRAN_BLD_BAT = """\ "%R%" CMD INSTALL --build . if errorlevel 1 exit 1 @rem Add more build steps here, if they are necessary. @rem See @rem http://docs.continuum.io/conda/build.html @rem for a list of environment variables that are set during the build process. """ INDENT = '\n - ' CRAN_KEYS = [ 'Site', 'Archs', 'Depends', 'Enhances', 'Imports', 'License', 'License_is_FOSS', 'License_restricts_use', 'LinkingTo', 'MD5sum', 'NeedsCompilation', 'OS_type', 'Package', 'Path', 'Priority', 'Suggests', 'Version', 'Title', 'Author', 'Maintainer', ] # The following base/recommended package names are derived from R's source # tree (R-3.0.2/share/make/vars.mk). Hopefully they don't change too much # between versions. R_BASE_PACKAGE_NAMES = ( 'base', 'tools', 'utils', 'grDevices', 'graphics', 'stats', 'datasets', 'methods', 'grid', 'splines', 'stats4', 'tcltk', 'compiler', 'parallel', ) R_RECOMMENDED_PACKAGE_NAMES = ( 'MASS', 'lattice', 'Matrix', 'nlme', 'survival', 'boot', 'cluster', 'codetools', 'foreign', 'KernSmooth', 'rpart', 'class', 'nnet', 'spatial', 'mgcv', ) # Stolen then tweaked from debian.deb822.PkgRelation.__dep_RE. VERSION_DEPENDENCY_REGEX = re.compile( r'^\s*(?P[a-zA-Z0-9.+\-]{1,})' r'(\s*\(\s*(?P[>=<]+)\s*' r'(?P[0-9a-zA-Z:\-+~.]+)\s*\))' r'?(\s*\[(?P[\s!\w\-]+)\])?\s*$' ) def dict_from_cran_lines(lines): d = {} for line in lines: if not line: continue try: (k, v) = line.split(': ', 1) except ValueError: sys.exit("Error: Could not parse metadata (%s)" % line) d[k] = v # if k not in CRAN_KEYS: # print("Warning: Unknown key %s" % k) d['orig_lines'] = lines return d def remove_package_line_continuations(chunk): """ >>> chunk = [ 'Package: A3', 'Version: 0.9.2', 'Depends: R (>= 2.15.0), xtable, pbapply', 'Suggests: randomForest, e1071', 'Imports: MASS, R.methodsS3 (>= 1.5.2), R.oo (>= 1.15.8), R.utils (>=', ' 1.27.1), matrixStats (>= 0.8.12), R.filesets (>= 2.3.0), ', ' sampleSelection, scatterplot3d, strucchange, systemfit', 'License: GPL (>= 2)', 'NeedsCompilation: no'] >>> remove_package_line_continuations(chunk) ['Package: A3', 'Version: 0.9.2', 'Depends: R (>= 2.15.0), xtable, pbapply', 'Suggests: randomForest, e1071', 'Imports: MASS, R.methodsS3 (>= 1.5.2), R.oo (>= 1.15.8), R.utils (>= 1.27.1), matrixStats (>= 0.8.12), R.filesets (>= 2.3.0), sampleSelection, scatterplot3d, strucchange, systemfit, rgl,' 'License: GPL (>= 2)', 'NeedsCompilation: no'] """ # NOQA continuation = (' ', '\t') continued_ix = None continued_line = None had_continuation = False accumulating_continuations = False chunk.append('') for (i, line) in enumerate(chunk): if line.startswith(continuation): line = ' ' + line.lstrip() if accumulating_continuations: assert had_continuation continued_line += line chunk[i] = None else: accumulating_continuations = True continued_ix = i - 1 continued_line = chunk[continued_ix] + line had_continuation = True chunk[i] = None else: if accumulating_continuations: assert had_continuation chunk[continued_ix] = continued_line accumulating_continuations = False continued_line = None continued_ix = None if had_continuation: # Remove the None(s). chunk = [c for c in chunk if c] chunk.append('') return chunk def yaml_quote_string(string): """ Quote a string for use in YAML. We can't just use yaml.dump because it adds ellipses to the end of the string, and it in general doesn't handle being placed inside an existing document very well. Note that this function is NOT general. """ return yaml.dump(string, Dumper=SafeDumper).replace('\n...\n', '').replace('\n', '\n ') def clear_trailing_whitespace(string): lines = [] for line in string.splitlines(): lines.append(line.rstrip()) return '\n'.join(lines) def get_package_metadata(cran_url, package, session): url = cran_url + 'web/packages/' + package + '/DESCRIPTION' r = session.get(url) try: r.raise_for_status() except requests.exceptions.HTTPError as e: if e.response.status_code == 404: sys.exit("ERROR: %s (404 Not Found)" % url) raise DESCRIPTION = r.text d = dict_from_cran_lines(remove_package_line_continuations(DESCRIPTION.splitlines())) d['orig_description'] = DESCRIPTION return d def get_latest_git_tag(): p = subprocess.Popen(['git', 'describe', '--abbrev=0', '--tags'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=source.WORK_DIR) stdout, stderr = p.communicate() stdout = stdout.decode('utf-8') stderr = stderr.decode('utf-8') if stderr or p.returncode: sys.exit("Error: git tag failed (%s)" % stderr) tags = stdout.strip().splitlines() if not tags: sys.exit("Error: no tags found") print("Using tag %s" % tags[-1]) return tags[-1] def get_session(output_dir, verbose=True, cache=[]): if cache: return cache[0] session = requests.Session() try: import cachecontrol import cachecontrol.caches except ImportError: if verbose: print("Tip: install CacheControl to cache the CRAN metadata") else: session = cachecontrol.CacheControl(session, cache=cachecontrol.caches.FileCache(join(output_dir, '.web_cache'))) cache.append(session) return session def get_cran_metadata(cran_url, output_dir, verbose=True): session = get_session(output_dir, verbose=verbose) if verbose: print("Fetching metadata from %s" % cran_url) r = session.get(cran_url + "src/contrib/PACKAGES") r.raise_for_status() PACKAGES = r.text package_list = [remove_package_line_continuations(i.splitlines()) for i in PACKAGES.split('\n\n')] return {d['Package'].lower(): d for d in map(dict_from_cran_lines, package_list)} def main(args, parser): if len(args.packages) > 1 and args.version_compare: parser.error("--version-compare only works with one package at a time") if not args.update_outdated and not args.packages: parser.error("At least one package must be supplied") package_dicts = {} [output_dir] = args.output_dir cran_metadata = get_cran_metadata(args.cran_url, output_dir) if args.update_outdated: args.packages = get_outdated(output_dir, cran_metadata, args.packages) for pkg in args.packages: rm_rf(join(args.output_dir[0], 'r-' + pkg)) while args.packages: package = args.packages.pop() is_github_url = 'github.com' in package url = package if is_github_url: rm_rf(source.WORK_DIR) source.git_source({'git_url': package}, '.') git_tag = args.git_tag[0] if args.git_tag else get_latest_git_tag() p = subprocess.Popen(['git', 'checkout', git_tag], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=source.WORK_DIR) stdout, stderr = p.communicate() stdout = stdout.decode('utf-8') stderr = stderr.decode('utf-8') if p.returncode: sys.exit("Error: 'git checkout %s' failed (%s).\nInvalid tag?" % (git_tag, stderr.strip())) if stdout: print(stdout, file=sys.stdout) if stderr: print(stderr, file=sys.stderr) DESCRIPTION = join(source.WORK_DIR, "DESCRIPTION") if not isfile(DESCRIPTION): sub_description_pkg = join(source.WORK_DIR, 'pkg', "DESCRIPTION") sub_description_name = join(source.WORK_DIR, package.split('/')[-1], "DESCRIPTION") if isfile(sub_description_pkg): DESCRIPTION = sub_description_pkg elif isfile(sub_description_name): DESCRIPTION = sub_description_name else: sys.exit("%s does not appear to be a valid R package " "(no DESCRIPTION file in %s, %s)" % (package, sub_description_pkg, sub_description_name)) with open(DESCRIPTION) as f: description_text = clear_trailing_whitespace(f.read()) d = dict_from_cran_lines(remove_package_line_continuations( description_text.splitlines())) d['orig_description'] = description_text package = d['Package'].lower() cran_metadata[package] = d if package.startswith('r-'): package = package[2:] if package.endswith('/'): package = package[:-1] if package.lower() not in cran_metadata: sys.exit("Package %s not found" % package) # Make sure package is always uses the CRAN capitalization package = cran_metadata[package.lower()]['Package'] if not is_github_url: session = get_session(output_dir) cran_metadata[package.lower()].update(get_package_metadata(args.cran_url, package, session)) dir_path = join(output_dir, 'r-' + package.lower()) if exists(dir_path) and not args.version_compare: raise RuntimeError("directory already exists: %s" % dir_path) cran_package = cran_metadata[package.lower()] d = package_dicts.setdefault(package, { 'cran_packagename': package, 'packagename': 'r-' + package.lower(), 'build_depends': '', 'run_depends': '', # CRAN doesn't seem to have this metadata :( 'home_comment': '#', 'homeurl': '', 'summary_comment': '#', 'summary': '', }) if is_github_url: d['url_key'] = '' d['fn_key'] = '' d['git_url_key'] = 'git_url:' d['git_tag_key'] = 'git_tag:' d['filename'] = '' d['cranurl'] = '' d['git_url'] = url d['git_tag'] = git_tag else: d['url_key'] = 'url:' d['fn_key'] = 'fn:' d['git_url_key'] = '' d['git_tag_key'] = '' d['git_url'] = '' d['git_tag'] = '' if args.version: raise NotImplementedError("Package versions from CRAN are not yet implemented") [version] = args.version d['version'] = version d['cran_version'] = cran_package['Version'] # Conda versions cannot have -. Conda (verlib) will treat _ as a . d['conda_version'] = d['cran_version'].replace('-', '_') if args.version_compare: sys.exit(not version_compare(dir_path, d['conda_version'])) if not is_github_url: d['filename'] = "{cran_packagename}_{cran_version}.tar.gz".format(**d) if args.archive: d['cranurl'] = (INDENT + args.cran_url + 'src/contrib/' + d['filename'] + INDENT + args.cran_url + 'src/contrib/' + 'Archive/' + d['cran_packagename'] + '/' + d['filename']) else: d['cranurl'] = ' ' + args.cran_url + 'src/contrib/' + d['filename'] d['cran_metadata'] = '\n'.join(['# %s' % l for l in cran_package['orig_lines'] if l]) # XXX: We should maybe normalize these d['license'] = cran_package.get("License", "None") # Tend towards the more clear GPL3 and away from the ambiguity of GPL2. if 'GPL (>= 2)' in d['license'] or d['license'] == 'GPL': d['license_family'] = 'GPL3' else: d['license_family'] = get_close_matches(d['license'], metadata.allowed_license_families, 1, 0.0)[0] if 'License_is_FOSS' in cran_package: d['license'] += ' (FOSS)' if cran_package.get('License_restricts_use', None) == 'yes': d['license'] += ' (Restricts use)' if "URL" in cran_package: d['home_comment'] = '' d['homeurl'] = ' ' + yaml_quote_string(cran_package['URL']) if 'Description' in cran_package: d['summary_comment'] = '' d['summary'] = ' ' + yaml_quote_string(cran_package['Description']) if "Suggests" in cran_package: d['suggests'] = "# Suggests: %s" % cran_package['Suggests'] else: d['suggests'] = '' # Every package depends on at least R. # I'm not sure what the difference between depends and imports is. depends = [s.strip() for s in cran_package.get('Depends', '').split(',') if s.strip()] imports = [s.strip() for s in cran_package.get('Imports', '').split(',') if s.strip()] links = [s.strip() for s in cran_package.get("LinkingTo", '').split(',') if s.strip()] dep_dict = {} for s in set(chain(depends, imports, links)): match = VERSION_DEPENDENCY_REGEX.match(s) if not match: sys.exit("Could not parse version from dependency of %s: %s" % (package, s)) name = match.group('name') archs = match.group('archs') relop = match.group('relop') or '' version = match.group('version') or '' version = version.replace('-', '_') # If there is a relop there should be a version assert not relop or version if archs: sys.exit("Don't know how to handle archs from dependency of " "package %s: %s" % (package, s)) dep_dict[name] = '{relop}{version}'.format(relop=relop, version=version) if 'R' not in dep_dict: dep_dict['R'] = '' for dep_type in ['build', 'run']: deps = [] for name in sorted(dep_dict): if name in R_BASE_PACKAGE_NAMES: continue if name == 'R': # Put R first # Regarless of build or run, and whether this is a recommended package or not, # it can only depend on 'r-base' since anything else can and will cause cycles # in the dependency graph. The cran metadata lists all dependencies anyway, even # those packages that are in the recommended group. r_name = 'r-base' # We don't include any R version restrictions because we # always build R packages against an exact R version deps.insert(0, '{indent}{r_name}'.format(indent=INDENT, r_name=r_name)) else: conda_name = 'r-' + name.lower() if dep_dict[name]: deps.append('{indent}{name} {version}'.format(name=conda_name, version=dep_dict[name], indent=INDENT)) else: deps.append('{indent}{name}'.format(name=conda_name, indent=INDENT)) if args.recursive: if not exists(join(output_dir, conda_name)): args.packages.append(name) if cran_package.get("NeedsCompilation", 'no') == 'yes': if dep_type == 'build': deps.append('{indent}posix # [win]'.format(indent=INDENT)) deps.append('{indent}{{{{native}}}}toolchain # [win]'.format(indent=INDENT)) deps.append('{indent}gcc # [not win]'.format(indent=INDENT)) d['%s_depends' % dep_type] = ''.join(deps) for package in package_dicts: d = package_dicts[package] name = d['packagename'] # Normalize the metadata values d = {k: unicodedata.normalize("NFKD", compat.text_type(v)).encode('ascii', 'ignore') .decode() for k, v in compat.iteritems(d)} makedirs(join(output_dir, name)) print("Writing recipe for %s" % package.lower()) with open(join(output_dir, name, 'meta.yaml'), 'w') as f: f.write(clear_trailing_whitespace(CRAN_META.format(**d))) with open(join(output_dir, name, 'build.sh'), 'w') as f: f.write(CRAN_BUILD_SH.format(**d)) with open(join(output_dir, name, 'bld.bat'), 'w') as f: f.write(CRAN_BLD_BAT.format(**d)) print("Done") def version_compare(recipe_dir, newest_conda_version): m = metadata.MetaData(recipe_dir) local_version = m.version() package = basename(recipe_dir) print("Local recipe for %s has version %s." % (package, local_version)) print("The version on CRAN for %s is %s." % (package, newest_conda_version)) return local_version == newest_conda_version def get_outdated(output_dir, cran_metadata, packages=()): to_update = [] recipes = listdir(output_dir) for recipe in recipes: if not recipe.startswith('r-') or not isdir(recipe): continue recipe_name = recipe[2:] for i, package in enumerate(packages): if package.endswith('/'): packages[i] = package[:-1] if packages and not (recipe_name in packages or recipe in packages): continue if recipe_name not in cran_metadata: print("Skipping %s, not found on CRAN" % recipe) continue up_to_date = version_compare(join(output_dir, recipe), cran_metadata[recipe_name]['Version'].replace('-', '_')) if up_to_date: print("%s is up-to-date." % recipe) continue print("Updating %s" % recipe) to_update.append(recipe_name) return to_update