Source code for glue.core.component

from __future__ import absolute_import, division, print_function

import logging

import numpy as np
import pandas as pd

from glue.utils import (shape_to_string, coerce_numeric,
                        broadcast_to, categorical_ndarray)


# Names exported by a star-import of this module: the base ``Component``
# class and the specialised subclasses defined below.
__all__ = ['Component', 'DerivedComponent', 'CategoricalComponent',
           'CoordinateComponent', 'DateTimeComponent']


class Component(object):
    """
    Stores the actual, numerical information for a particular quantity.

    Data objects hold one or more components, accessed via
    ComponentIDs. All Components in a data set must have the same
    shape and number of dimensions.

    Notes
    -----
    Instead of instantiating Components directly, consider using
    :meth:`Component.autotyped`, which chooses a subclass most appropriate
    for the data type.
    """

    def __init__(self, data, units=None):
        """
        :param data: The data to store
        :type data: :class:`numpy.ndarray`

        :param units: Optional unit label
        :type units: str

        :raises TypeError: if ``data`` is a ``numpy.datetime64`` array
        """

        # The physical units of the data (normalised by the property setter)
        self.units = units

        # The actual data
        # subclasses may pass non-arrays here as placeholders, in which case
        # the value is stored untouched.
        if isinstance(data, np.ndarray):
            if data.dtype.kind == 'M':
                raise TypeError('DateTimeComponent should be used instead of Component for np.datetime64 arrays')
            data = coerce_numeric(data)
            data.setflags(write=False)  # data is read-only

        self._data = data

    @property
    def units(self):
        """ The unit label, always a string (empty when no units were given) """
        return self._units

    @units.setter
    def units(self, value):
        # ``None`` is stored as an empty string so that ``units`` is always
        # a str; anything else is converted with ``str``.
        if value is None:
            self._units = ''
        else:
            self._units = str(value)

    @property
    def data(self):
        """ The underlying :class:`numpy.ndarray` """
        return self._data

    @property
    def shape(self):
        """ Tuple of array dimensions """
        return self._data.shape

    @property
    def ndim(self):
        """ The number of dimensions """
        return len(self._data.shape)

    def __getitem__(self, key):
        logging.debug("Using %s to index data of shape %s", key, self.shape)
        return self._data[key]

    @property
    def numeric(self):
        """
        Whether or not the datatype is numeric
        """
        # We need to be careful here to not just access self.data since that
        # would force the computation of the whole component in the case of
        # derived components, so instead we specifically only get the first
        # element.
        #
        # The builtin ``complex`` is used as the target type: the
        # ``np.complex`` alias was deprecated in NumPy 1.20 and removed in
        # NumPy 1.24, where accessing it raises AttributeError.
        return np.can_cast(self[(0,) * self.ndim].dtype, complex)

    @property
    def categorical(self):
        """
        Whether or not the datatype is categorical
        """
        return False

    @property
    def datetime(self):
        """
        Whether or not the datatype is a date/time
        """
        return False

    def __str__(self):
        return "%s with shape %s" % (self.__class__.__name__,
                                     shape_to_string(self.shape))

    def jitter(self, method=None):
        """
        Jittering is not supported by the base class.

        :raises NotImplementedError: always
        """
        raise NotImplementedError

    def to_series(self, **kwargs):
        """
        Convert into a pandas.Series object.

        The data are flattened with ``ravel`` before being wrapped.

        :param kwargs: All kwargs are passed to the Series constructor.
        :return: pandas.Series
        """
        return pd.Series(self.data.ravel(), **kwargs)

    @classmethod
    def autotyped(cls, data, units=None):
        """
        Automatically choose between Component, CategoricalComponent and
        DateTimeComponent, based on the input data type.

        :param data: The data to pack into a Component (array-like)
        :param units: Optional units
        :type units: str

        :returns: A Component (or subclass)
        """
        data = np.asarray(data)

        # Object arrays are always treated as categorical
        if np.issubdtype(data.dtype, np.object_):
            return CategoricalComponent(data, units=units)

        # datetime64 arrays get their own component type; the units are
        # forwarded just as in every other branch.
        if data.dtype.kind == 'M':
            return DateTimeComponent(data, units=units)

        n = coerce_numeric(data)

        # String data are categorical when at most this fraction of the
        # coerced values is finite.
        thresh = 0.5
        try:
            use_categorical = np.issubdtype(data.dtype, np.character) and \
                np.isfinite(n).mean() <= thresh
        except TypeError:  # isfinite not supported. non-numeric dtype
            use_categorical = True

        if use_categorical:
            return CategoricalComponent(data, units=units)
        else:
            return Component(n, units=units)
class DerivedComponent(Component):
    """
    A component whose values come from evaluating a link function rather
    than from a stored array.
    """

    def __init__(self, data, link, units=None):
        """
        :param data: The data object to use for calculation
        :type data: :class:`~glue.core.data.Data`

        :param link: The link that carries out the function
        :type link: :class:`~glue.core.component_link.ComponentLink`

        :param units: Optional unit description
        """
        self._link = link
        super(DerivedComponent, self).__init__(data, units=units)

    def set_parent(self, data):
        """
        Point this DerivedComponent at a different Data object.

        :param data: The Data object subsequent computations operate on
        """
        self._data = data

    @property
    def data(self):
        """ The full numerical data, computed through the link """
        parent = self._data
        return self._link.compute(parent)

    def __getitem__(self, key):
        # Hand the view to the link together with the parent data instead
        # of indexing the result of the ``data`` property.
        parent = self._data
        return self._link.compute(parent, key)
class CoordinateComponent(Component):
    """
    Components associated with pixel or world coordinates

    The numerical values are computed on the fly.
    """

    def __init__(self, data, axis, world=False):
        """
        :param data: The object whose ``shape``, ``ndim`` and ``coords``
                     attributes are used to compute the coordinate values
        :param axis: Index of the axis this component describes
        :param world: If `True`, compute world coordinates; otherwise pixel
                      coordinates are returned
        """
        # The base class is initialised with placeholders; the parent data
        # object is stored in ``_data`` instead of an array.
        super(CoordinateComponent, self).__init__(None, None)
        self.world = world
        self._data = data
        self.axis = axis

    @property
    def data(self):
        """ The coordinate values for the whole dataset, computed on demand """
        return self._calculate()

    def _calculate(self, view=None):
        """
        Compute the coordinate values, optionally restricted to ``view``.

        :param view: Optional index (scalar, slice, tuple/list of these, or
                     tuple/list of index arrays) selecting part of the array
        :returns: The pixel or world coordinates along ``self.axis``
        """

        if self.world:

            # Calculating the world coordinates can be a bottleneck if we aren't
            # careful, so we need to make sure that if not all dimensions depend
            # on each other, we use smart broadcasting.

            # The unoptimized way to do this for an N-dimensional dataset would
            # be to construct N-dimensional arrays of pixel values for each
            # coordinate. However, if we are computing the coordinates for axis
            # i, and axis i is not dependent on any other axis, then the result
            # will be an N-dimensional array where the same 1D array of
            # coordinates will be repeated over and over.

            # To optimize this, we therefore essentially consider only the
            # dependent dimensions and then broadcast the result to the full
            # array size at the very end.

            # view=None actually adds a dimension which is never what we really
            # mean, at least in glue.
            if view is None:
                view = Ellipsis

            # If the view is a tuple or list of arrays, we should actually just
            # convert these straight to world coordinates since the indices
            # of the pixel coordinates are the pixel coordinates themselves.
            # NOTE(review): the axis index and argument order are reversed
            # here -- presumably the coords API orders axes opposite to
            # numpy; confirm against the coordinates class.
            if isinstance(view, (tuple, list)) and isinstance(view[0], np.ndarray):
                axis = self._data.ndim - 1 - self.axis
                return self._data.coords.pixel2world_single_axis(*view[::-1],
                                                                 axis=axis)

            # For 1D arrays, slice can be given as a single slice but we need
            # to wrap it in a list to make the following code work correctly,
            # as it is then consistent with higher-dimensional cases.
            if isinstance(view, slice) or np.isscalar(view):
                view = [view]

            # Some views, e.g. with lists of integer arrays, can give
            # arbitrarily complex (copied) subsets of arrays, so in this case
            # we don't do any optimization. The optimization is only used
            # when every element of the view is a scalar or a slice.
            if view is Ellipsis:
                optimize_view = False
            else:
                for v in view:
                    if not np.isscalar(v) and not isinstance(v, slice):
                        optimize_view = False
                        break
                else:
                    optimize_view = True

            pix_coords = []
            dep_coords = self._data.coords.dependent_axes(self.axis)

            # ``final_slice`` removes the dimensions indexed by a scalar;
            # ``final_shape`` is the shape the result is broadcast to.
            final_slice = []
            final_shape = []

            for i in range(self._data.ndim):

                if optimize_view and i < len(view) and np.isscalar(view[i]):
                    final_slice.append(0)
                else:
                    final_slice.append(slice(None))

                # We set up a 1D pixel axis along that dimension.
                pix_coord = np.arange(self._data.shape[i])

                # If a view was specified, we need to take it into account for
                # that axis. Scalar-indexed axes contribute no entry to the
                # final shape.
                if optimize_view and i < len(view):
                    pix_coord = pix_coord[view[i]]
                    if not np.isscalar(view[i]):
                        final_shape.append(len(pix_coord))
                else:
                    final_shape.append(self._data.shape[i])

                if i not in dep_coords:
                    # The axis is not dependent on this instance's axis, so we
                    # just compute the values once and broadcast along this
                    # dimension later.
                    pix_coord = 0

                pix_coords.append(pix_coord)

            # We build the list of N arrays, one for each pixel coordinate
            pix_coords = np.meshgrid(*pix_coords, indexing='ij', copy=False)

            # Finally we convert these to world coordinates
            axis = self._data.ndim - 1 - self.axis
            world_coords = self._data.coords.pixel2world_single_axis(*pix_coords[::-1],
                                                                     axis=axis)

            # We get rid of any dimension for which using the view should get
            # rid of that dimension.
            if optimize_view:
                world_coords = world_coords[tuple(final_slice)]

            # We then broadcast the final array back to what it should be
            world_coords = broadcast_to(world_coords, tuple(final_shape))

            # We apply the view if we weren't able to optimize before
            if optimize_view:
                return world_coords
            else:
                return world_coords[view]

        else:

            # Pixel coordinates: build one index grid per axis, broadcast
            # them to the full shape, apply the view and return the grid
            # for this component's axis.
            slices = [slice(0, s, 1) for s in self.shape]
            grids = np.broadcast_arrays(*np.ogrid[slices])
            if view is not None:
                grids = [g[view] for g in grids]
            return grids[self.axis]

    @property
    def shape(self):
        """ Tuple of array dimensions. """
        return self._data.shape

    @property
    def ndim(self):
        """ Number of dimensions """
        return len(self._data.shape)

    def __getitem__(self, key):
        return self._calculate(key)

    def __lt__(self, other):
        # Components of the same kind are ordered by axis; otherwise a world
        # component compares as less than a pixel component.
        if self.world == other.world:
            return self.axis < other.axis
        return self.world

    def __gluestate__(self, context):
        # Only the axis and the world flag are saved, not the parent data.
        return dict(axis=self.axis, world=self.world)

    @classmethod
    def __setgluestate__(cls, rec, context):
        # The instance is rebuilt with ``None`` as its data.
        return cls(None, rec['axis'], rec['world'])

    @property
    def numeric(self):
        """ Coordinate components are always numeric """
        return True

    @property
    def categorical(self):
        """ Coordinate components are never categorical """
        return False
class CategoricalComponent(Component):
    """
    Container for categorical data.
    """

    def __init__(self, categorical_data, categories=None, jitter=None, units=None):
        """
        :param categorical_data: The underlying :class:`numpy.ndarray`
        :param categories: List of unique values in the data
        :param jitter: Strategy for jittering the data
        :param units: Optional unit label

        :raises ValueError: if the data are not 1-dimensional
        """

        # TODO: deal with custom categories

        # The base class is given a placeholder; the real data are stored
        # as a categorical array just below.
        super(CategoricalComponent, self).__init__(None, units)

        self._data = categorical_ndarray(categorical_data, copy=False,
                                         categories=categories)

        if self._data.ndim != 1:
            raise ValueError("Categorical Data must be 1-dimensional")

        self.jitter(method=jitter)

    @property
    def codes(self):
        """
        The index of the category for each value in the array.
        """
        return self._data.codes

    @property
    def labels(self):
        """
        The original categorical data.
        """
        return self._data.view(np.ndarray)

    @property
    def categories(self):
        """
        The categories.
        """
        return self._data.categories

    @property
    def data(self):
        """ The underlying categorical array """
        return self._data

    @property
    def numeric(self):
        """ Categorical components are never numeric """
        return False

    @property
    def categorical(self):
        """ Categorical components are always categorical """
        return True

    def jitter(self, method=None):
        """
        Jitter the codes so the density of points can be easily seen in a
        scatter plot for example.

        Parameters
        ----------
        method : {None, 'uniform'}
            If `None`, no jittering is done (or any jittering is undone).
            If ``'uniform'``, the codes are randomized by a uniformly
            distributed random variable.
        """
        self._data.jitter(method=method)
        self.jitter_method = method

    def to_series(self, **kwargs):
        """
        Convert into a pandas.Series object.

        The Series is always created with ``dtype=object``.

        :param kwargs: All kwargs are passed to the Series constructor.
        :return: pandas.Series
        """
        # The builtin ``object`` is used because the ``np.object`` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24, where accessing
        # it raises AttributeError.
        return pd.Series(self.labels, dtype=object, **kwargs)
class DateTimeComponent(Component):
    """
    Component holding date/time values.

    Parameters
    ----------
    data : `~numpy.ndarray`
        The values to store; the array must have a `~numpy.datetime64` dtype.
    units : str, optional
        Unit label.

    Raises
    ------
    TypeError
        If ``data`` is not a Numpy array of datetime64 values.
    """

    def __init__(self, data, units=None):
        self.units = units
        # Short-circuits, so ``dtype`` is only inspected on real arrays.
        is_datetime_array = isinstance(data, np.ndarray) and data.dtype.kind == 'M'
        if not is_datetime_array:
            raise TypeError("DateTimeComponent should be initialized with a datetim64 Numpy array")
        self._data = data

    @property
    def numeric(self):
        """ Date/time components report themselves as numeric """
        return True

    @property
    def datetime(self):
        """ Date/time components always hold date/time values """
        return True