Source code for sails.circular_plots

#!/usr/bin/python3

# vim: set expandtab ts=4 sw=4;

from csv import DictReader
from os import makedirs, symlink
from os.path import join
from shutil import copyfile
import warnings

import tempfile

import numpy as np

import matplotlib.pyplot as plt

# Note that we currently assume the use of 1000 for band widths to make life
# easier
BAND_WIDTH = 1000

CIRCOS_CONF_TEMPLATE = """
karyotype = {karyotype:s}
chromosomes_units = 1

<ideogram>
<spacing>
default = 0.005r
</spacing>
radius           = {circle_radius:f}r
thickness        = 50p
fill             = yes
stroke_color     = dgrey
stroke_thickness = 2p
show_label       = no
show_bands            = yes
fill_bands            = yes
band_stroke_thickness = 2
band_stroke_color     = white
band_transparency     = 4
</ideogram>

{ticks:s}

<image>
angle_orientation* = counterclockwise
radius* = {radius:d}p
<<include {circospath:s}/image.conf>>
</image>
<<include {circospath:s}/colors_fonts_patterns.conf>>
<<include {circospath:s}/housekeeping.conf>>
<links>
<link>
file          = {linksname:s}
radius        = 0.95r
color         = black_a4
bezier_radius = 0.1r
thickness     = 1
</link>
</links>

<image>
file = {outputimage:s}
</image>
"""


class CircosException(BaseException):
    def __init__(self, retcode, stdout, stderr):
        self.retcode = retcode
        self.stdout = stdout
        self.stderr = stderr

    def __str__(self):
        return '<CircosException: Return code: {}>'.format(self.retcode)


[docs]class CircosGroup(object): def __init__(self, name, c_idx=0, n_idx=0, circoscolour='unknown', rgbcolour=None, netmatreverse=False): self.name = name self.c_idx = c_idx self.n_idx = n_idx self.circoscolour = circoscolour if rgbcolour is None: self.rgbcolour = [0, 0, 0] else: # Make sure to take a copy self.rgbcolour = rgbcolour[:] self.netmatreverse = netmatreverse # Dictionary of regions self.regions = {} def __str__(self): return '<CircosGroup {}>'.format(self.name) def __repr__(self): return '<CircosGroup {}>'.format(self.name) def __len__(self): return len(self.regions) def add_region(self, name, offset, shortname, analysisid): """ Adds a new region to the group. Parameters ---------- name: str Region name offset: int Region offset value shortname: str Region short name (used in plotting) analysisid: int Data index to use when accessing region data """ self.regions[name] = {} self.regions[name]['offset'] = offset self.regions[name]['shortname'] = shortname self.regions[name]['analysisid'] = analysisid def find_region_idx(self, region): """ Find a region index from an name. Parameters ---------- region: str Region name Returns ------- int or None Region index or None if not found """ if region in self.regions: return self.regions[region]['analysisid'] return None def find_region_from_idx(self, idx): """ Find a region name from an index. Parameters ---------- idx: int Region index Returns ------- str or None Region name or None if not found """ for region in self.regions.keys(): if self.regions[region]['analysisid'] == idx: return region return None def has_region(self, region): return region in self.regions.keys() @property def circular_indices(self): """ Get a list of all region analysis indices in circular plotting order. Returns ------- list List of indices to use when accessing data in order to access data in the correct order for circular plotting. """ return [x[1]['analysisid'] for x in sorted(self.regions.items(), key=lambda x: x[1]['offset'])] @property def circular_regions(self): """ Get a list of the regions in circular plotting order Returns ------- list List of region names to use in the correct order for circular plotting. """ return [x[0] for x in sorted(self.regions.items(), key=lambda x: x[1]['offset'])] @property def circular_shortnames(self): """ Get a list of the region short names in circular plotting order Returns ------- list List of region short names names in the correct order for circular plotting. """ return [x[1]['shortname'] for x in sorted(self.regions.items(), key=lambda x: x[1]['offset'])] def circular_link_cache(self, groupname): """Return a dictionary of the regions and their information""" ret = {} for region in self.regions: ret[region] = (groupname, self.regions[region]['offset'], self.regions[region]['offset'] + BAND_WIDTH) return ret @property def netmat_indices(self): """ Get a list of all analysis indices in netmat plotting order. Returns ------- list List of indices to use when accessing data in order to access data in the correct order for netmat plotting. """ tmp = self.circular_indices if self.netmatreverse: tmp = tmp[::-1] return tmp @property def netmat_regions(self): """ Get a list of all region names in netmat plotting order. Returns ------- list List of names to use when accessing data in order to access data in the correct order for netmat plotting. """ tmp = self.circular_regions if self.netmatreverse: tmp = tmp[::-1] return tmp @property def netmat_shortnames(self): """ Get a list of the region short names in netmat plotting order Returns ------- list List of region short names names in the correct order for netmat plotting. """ tmp = self.circular_shortnames if self.netmatreverse: tmp = tmp[::-1] return tmp
[docs]class CircosHandler(object): """ Handler for producing plots using the Circos tool. Note that if you are creating plots for publication using Circos, you should cite the relevant publication: [Krzywinski2009]_. """ def __init__(self): self.groups = {} def has_region(self, region): """Check whether a region exists in any currently defined group""" for region in self.groups.keys(): if self.groups[region].has_region(region): return True return False def find_region_idx(self, region): """Find a region index""" for group in self.groups.keys(): idx = self.groups[group].find_region_idx(region) if idx is not None: return idx return None def find_region_from_idx(self, idx): """Find a region name from an index""" for group in self.groups.keys(): name = self.groups[group].find_region_from_idx(idx) if name is not None: return name return None @property def num_regions(self): return len(self.circular_regions) def colour_mapping(self, normalise=True): """ Return a dictionary of colour name to 3-tuples. Parameters ---------- normalise: bool, optional If True, colour names will be scaled from the standard 0-255 range to 0-1; this is primarily useful for matplotlib Returns ------- dict Maps colour names to 3-tuples of colour information. """ colours = {} for group in self.groups.keys(): colourname = self.groups[group].circoscolour rgbcolour = self.groups[group].rgbcolour if normalise: rgbcolour = [(x / 255.0) for x in rgbcolour] colours[colourname] = rgbcolour return colours @property def data_regions(self): """Return a list of region names in data order""" ret = [] for k in range(self.num_regions): ret.append(self.find_region_from_idx(k)) return ret @property def circular_indices(self): """Return a list of all region analysis indices in circular plotting order""" ret = [] for group in self.circular_groups: ret.extend(self.groups[group].circular_indices) return ret @property def circular_regions(self): """Return a list of the groups in ascending order""" ret = [] for group in self.circular_groups: ret.extend(self.groups[group].circular_regions) return ret @property def circular_shortnames(self): """Return a list of all shortnames in circular plotting order""" ret = [] for group in self.circular_groups: ret.extend(self.groups[group].circular_shortnames) return ret @property def circular_link_cache(self): """Returns a dictionary which caches the circular position information for links""" ret = {} for group in self.circular_groups: ret.update(self.groups[group].circular_link_cache(group)) return ret @property def netmat_indices(self): """Return a list of all region analysis indices in netmat plotting order""" ret = [] for group in self.netmat_groups: ret.extend(self.groups[group].netmat_indices) return ret @property def netmat_regions(self): """Return a list of all regions in netmat plotting order""" ret = [] for group in self.netmat_groups: ret.extend(self.groups[group].netmat_regions) return ret @property def netmat_shortnames(self): """Return a list of all shortnames in netmat plotting order""" ret = [] for group in self.netmat_groups: ret.extend(self.groups[group].netmat_shortnames) return ret @property def circular_groups(self): """Return a list of the groups in circular order""" return [x[0] for x in sorted(self.groups.items(), key=lambda x: x[1].c_idx)] @property def netmat_groups(self): """Return a list of the groups in netmat order""" return [x[0] for x in sorted(self.groups.items(), key=lambda x: x[1].n_idx)] @classmethod def from_csv_files(cls, group_filename, region_filename, analysis_column='AnalysisID'): """ This routine sets up a CircosHandler from two CSV files. The group file must contain a list of groups; in Circos terms these are known as the 'chromosomes'. This file must contain the columns: * Group: The name of the group: must be unique * CircularIdx: The ordering of the group (from 1..N) in the circular plot order The following columns are optional: * NetmatIdx: The ordering of the group (from 1..N) in the netmat plot order. If not provided, the CircularIdx will be used * CircosColour: The colour to use for the chromosome in circos format * RGBColour: A 3-tuple of the RGB (0-255) colour to use in non-circos (e.g. for netmats) If no colour is set, mid-grey will be used. * NetmatReverseOrder: If True, when sorting out netmat indices, the ordering within the group will be reversed. This is commonly used when going from circular to netmat order on the left hand side of the plot. If not set, False will be assumed. The region file must contain a list of regions; in Circos terms these are known as the 'bands' within each chromosome. This file must contain the columns: * Region: The name of the region * Group: The name of the group which the region should be placed in * GroupOffset: The integer location at which the region should be placed within the group. Note that by default, this should be a multiple of 1000. The first value within a region should be 0 * AnalysisID: The position of the data for the region in the analysis data. I.e. an index from 0-(N-1). This is used to extract information from, e.g. netmats in the correct order. Note that the name of this column can be varied using the optional 'analysis_column' argument to the function. This allows having CSV files with multiple data orderings in different columns. The following columns are optional: * ShortName: The short name to use on the outer labels. If this is not present, the Region name will be used. Parameters ---------- group_filename: str Filename of group CSV file to load (see above) region_filename: str Filename of the region CSV file to load (see above) analysis_column: str, optional Column name to use for loading analysis ordering (see above). Defaults to 'AnalysisID' Returns ------- CircosHandler CircosHandler object ready to be used for plotting. """ ret = cls() # Read the groups with open(group_filename, 'r') as f: reader = DictReader(f) for row in reader: name = row['Group'] if name in ret.groups: raise Exception("Group name {} defined more than once".format(name)) c_idx = int(row['CircularIdx']) n_idx = int(row.get('NetmatIdx', c_idx)) circoscolour = row.get('CircosColour', 'grey') rgbcolour = [int(x.strip()) for x in row.get('RGBColour', '128,128,128').split(',')] tmp = row.get('NetmatReverseOrder', 'False') if tmp.lower() in ['true', '1', 't', 'y', 'yes']: netmatreverse = True else: netmatreverse = False group = CircosGroup(name, c_idx, n_idx, circoscolour, rgbcolour, netmatreverse) ret.groups[name] = group # Now read each region with open(region_filename, 'r') as f: reader = DictReader(f) for row in reader: name = row['Region'] group = row['Group'] offset = int(row['GroupOffset']) shortname = row.get('ShortName', name) analysispos = int(row.get(analysis_column)) if group not in ret.groups: raise Exception("Group {} for ROI {} not defined".format(group, name)) ret.groups[group].add_region(name, offset, shortname, analysispos) return ret def karyotype(self): """Return a string containing the contents of a karyotype file""" s = [] # First of all the group definitions for group in self.circular_groups: g = self.groups[group] num_entries = len(g.regions) s.append('chr - {} {} 0 {} {}'.format(group, g.c_idx, num_entries * BAND_WIDTH, g.circoscolour)) # Now the region definitions for group in self.circular_groups: g = self.groups[group] for region in g.circular_regions: start = g.regions[region]['offset'] end = start + BAND_WIDTH s.append('band {} {} {} {} {} grey'.format(group, region, region, start, end)) return '\n'.join(s) def ticks(self, **kwargs): """ Return a string containing the contents of a ticks.conf file All parameters must be passed as kwargs. Parameters ---------- ticks: bool Show ticks around circle tick_labels: bool Show tick labels around circle Returns ------- str String containing ticks.conf content needed for the relevant plot configuration """ s = [] # A header s.append('show_ticks = {}'.format('yes' if kwargs.get('ticks', True) else 'no')) s.append('show_tick_labels = {}'.format('yes' if kwargs.get('tick_labels', True) else 'no')) s.append('') s.append('<ticks>') s.append('chromosomes_display_default = no') s.append('skip_first_label = no') s.append('skip_last_label = no') s.append('radius = dims(ideogram,radius_outer)') s.append('tick_separation = 2p') s.append('label_separation = 5p') s.append('multiplier = 1') s.append('color = black') s.append('thickness = 4p') s.append('size = 20p') s.append('show_label = yes') s.append('label_size = 40p') s.append('label_offset = 10p') # Now the region definitions for group in self.circular_groups: g = self.groups[group] for region in g.circular_regions: shortname = g.regions[region]['shortname'] start = int(g.regions[region]['offset']) s.append('<tick>') s.append('chromosomes = {}'.format(group)) s.append('position = {}u'.format(start + (BAND_WIDTH // 2))) s.append('format = {}'.format(shortname)) s.append('</tick>') s.append('</ticks>') return '\n'.join(s) def links(self, data, linkcolors=None, defaultlinkcolor='black'): """ Parameters ---------- data: ndarray an (N x N) matrix containing thickness (and np.nan where no connection is desired) linkcolors: tuple, None or numpy array, optional One of 1) a tuple of three strings (zerocolor, negcolor, poscolor); e.g. ('black', 'blue', 'red), 2) None (in which case ('black', 'blue', 'red') will be assumed) or 3) a numpy array of dtype object (N, N) with a specific color string in each entry In the tuple case, all data values which are identically 0 will be coloured as zerocolor, all negative values as negcolor and all positive values as poscolor. In the numpy array case each connection will be set to the colour as specified in the same position in the array. If the entry is None then defaultcolor will be used defaultlinkcolor: str, optionals defaults to 'black'. Only used if linkcolors is a numpy array and None entries are found Returns ------- str String containing links.conf content needed for the relevant plot configuration """ s = [] if linkcolors is None: # zero, neg, pos linkcolors = ('black', 'blue', 'red') num_regions = self.num_regions if data.shape != (num_regions, num_regions): raise Exception("Data must be {} x {}".format(num_regions, num_regions)) # Generate a cache of map lookups and region IDs to names cache = self.circular_link_cache region_names = self.data_regions linefmt = "{} {} {} {} {} {} thickness={}p,color={}" for row in range(num_regions): for col in range(num_regions): val = data[row, col] # Skip any entries which are NaN if np.isnan(val): continue if isinstance(linkcolors, tuple): if val == 0: color = linkcolors[0] elif val < 0: color = linkcolors[1] else: color = linkcolors[2] else: # Assume a numpy array of strings color = linkcolors[row, col] if color is None: color = defaultlinkcolor reg_from = region_names[row] reg_to = region_names[col] # Note that thickness must be positive s.append(linefmt.format(cache[reg_from][0], cache[reg_from][1], cache[reg_from][2], cache[reg_to][0], cache[reg_to][1], cache[reg_to][2], np.abs(val), color)) return '\n'.join(s) def gen_circos_files(self, data, karyotype_name, output_name, radius=1500, circos_path='/etc/circos', output_dir=None, **kwargs): """ Create a Circos plot. Note that additional kwargs parameters can be passed to this routine. They will be passed to the ticks() and links() routines. See those routines for details of the additional parameters available. Parameters ---------- data : ndarray a (N x N) matrix containing thickness (and np.nan where no connection is desired) karyotype_name: str the filename of your karyotype file on disk output_name: str output basename of configuration files (with no extension) radius: int, optional in pixels (defaults to 1500) circos_path: str, optional where to find the circos configuration files (defaults to /etc/circos) output_name: str, optional Directory in which to store output circle_prop: float, optional proportion of image to fill circle with (defaults to 0.85). Must be in range 0-1. Note that if you are creating plots for publication using Circos, you should cite the relevant publication: [Krzywinski2009]_. """ linksname = output_name + '.txt' linkspath = linksname if output_dir is not None: linkspath = join(output_dir, linksname) imagename = output_name + '.png' confpath = output_name + '.conf' if output_dir is not None: confpath = join(output_dir, confpath) linkcolors = kwargs.get('linkcolors', None) defaultlinkcolor = kwargs.get('defaultlinkcolor', 'black') circle_radius = kwargs.get('circle_prop', 0.85) # First of all, create the .txt links file f = open(linkspath, 'w') f.write(self.links(data, linkcolors, defaultlinkcolor)) f.close() # Now the config file - pass all keyword arguments down ticks = self.ticks(**kwargs) f = open(confpath, 'w') f.write(CIRCOS_CONF_TEMPLATE.format(karyotype=karyotype_name, ticks=ticks, radius=radius, circospath=circos_path, linksname=linksname, outputimage=imagename, circle_radius=circle_radius)) f.close() def gen_circos_plot(self, data, output_name, radius=1500, circos_path='/etc/circos', **kwargs): """ Generate a circos plot output without leaving behind the configuration files. All arguments to this function are the same as gen_circos_files except that you do not specify a karyotype_name as this is unnecessary. Note that if you are creating plots for publication using Circos, you should cite the relevant publication: [Krzywinski2009]_. """ try: from subprocess import run, PIPE except ImportError: warnings.warn("subprocess.run is not present; this routine is not supported on python2") return with tempfile.TemporaryDirectory() as d: # Generate the karyotype f_k = open(join(d, 'karyotype.txt'), 'w') f_k.write(self.karyotype()) f_k.close() # Generate the config files self.gen_circos_files(data, 'karyotype.txt', 'circos', radius=radius, circos_path=circos_path, output_dir=d, **kwargs) # Set up the symlinks which seem to be necessary makedirs(join(d, 'etc')) symlink(join(circos_path, 'tracks'), join(d, 'etc', 'tracks')) # Now run circos and check for the error code # Can't use check_output as we're still on python3.5 (Debian stretch) r = run(['circos', '-conf', 'circos.conf'], cwd=d, stdout=PIPE, stderr=PIPE) if r.returncode != 0: raise CircosException(r.returncode, r.stdout, r.stderr) # Move files to final location copyfile(join(d, 'circos.png'), output_name + '.png') copyfile(join(d, 'circos.svg'), output_name + '.svg') def plot_netmat(self, data, ax=None, labels=True, hregions='top', vregions='right', show_xticks=True, show_yticks=True, showgrid=True, **kwargs): """ Plot a netmat with coloured bars to represent the grouping of the regions. Required arguments: data: a (N, N) matrix of data to be plotted Optional arguments: ax: An axes object on which to plot. One will be created on a new figure if not provided. labels: If True, use the shortnames, if False do not plot labels, if a list, use the list as the labels (must be N long) hregions: Location to place horizontal coloured bars indicating regions of ROIs. One of 'top', 'bottom', 'off'. Defaults to 'top' vregions: Location to place vertical coloured bars indicating regions of ROIs. One of 'left', 'right', 'off'. Defaults to 'right' show_xticks: Whether to show the xticks on the axis show_yticks: Whether to show the yticks on the axis showgrid: If True, place a grid on the image to delineate the regions. Keyword arguments: label_fontsize: will be passed to plot_netmat to set the font size Any additional keyword arguments will be passed down to pcolormesh to allow setting things such as the colormap """ from sails.plotting import plot_netmat # We need the colours in a normal form colour_dict = self.colour_mapping(normalise=True) colour_names = [] tick_pos = [] count = 0 if ax is None: plt.figure() ax = plt.subplot(1, 1, 1) # We need to know which colour each ROI is for group in self.netmat_groups: g = self.groups[group] colour_names.extend([g.circoscolour] * len(g)) tick_pos.append(count + len(g)) count += len(g) xtick_pos = None ytick_pos = None if show_xticks: xtick_pos = tick_pos if show_yticks: ytick_pos = tick_pos if labels is True: labels = self.netmat_shortnames elif labels is False: labels = None # Otherwise assume that they have provided a list of labels # which they want to use # Order data properly - can't think of a better way to do this idx = np.array(self.netmat_indices) new_data = data[idx, :][:, idx] return plot_netmat(new_data, ax=ax, colors=colour_dict, cnames=colour_names, xtick_pos=xtick_pos, ytick_pos=ytick_pos, labels=labels, hregions=hregions, vregions=vregions, showgrid=showgrid, **kwargs)