Module emblaze.utils
Expand source code
import sys
import numpy as np
from affine import Affine
from numba import jit
import json
import datetime
import platform
import os
import base64
class Field:
    """Canonical field names shared by embeddings and projections. Each of
    these fields can be versioned inside a ColumnarData object."""

    # Point fields
    POSITION = "position"
    COLOR = "color"
    RADIUS = "r"
    ALPHA = "alpha"

    # Thumbnail fields
    NAME = "name"
    DESCRIPTION = "description"
class ProjectionTechnique:
    """String identifiers for the available projection techniques."""

    UMAP = "umap"
    TSNE = "tsne"
    ALIGNED_UMAP = "aligned-umap"
    PCA = "pca"
class DataType:
    """String identifiers for kinds of data (categorical or continuous)."""

    CATEGORICAL = "categorical"
    CONTINUOUS = "continuous"
class PreviewMode:
    """String identifiers for the ways preview lines can be calculated."""

    PROJECTION_SIMILARITY = "projectionNeighborSimilarity"
    NEIGHBOR_SIMILARITY = "neighborSimilarity"
class SidebarPane:
    """Integer indexes identifying each sidebar pane in the widget."""

    CURRENT = 1
    SAVED = 2
    RECENT = 3
    SUGGESTED = 4
# Three 3-element sign vectors: all ones, first component negated, and second
# component negated. The third component is 1 in every vector.
FLIP_FACTORS = [
    np.array(signs)
    for signs in ([1, 1, 1], [-1, 1, 1], [1, -1, 1])
]
def projection_standardizer(emb):
    """
    Builds the Affine translation that shifts points by the negated mean of
    the first two columns of emb, i.e. the translation that moves the centroid
    of the given points onto the origin.
    """
    centroid = emb.mean(axis=0)
    shift = -(centroid[:2])
    return Affine.translation(*shift)
def affine_to_matrix(t):
    """
    Converts a transformation object exposing the coefficients a through i
    into the equivalent 3x3 numpy matrix, filled row by row.
    """
    top_row = [t.a, t.b, t.c]
    middle_row = [t.d, t.e, t.f]
    bottom_row = [t.g, t.h, t.i]
    return np.array([top_row, middle_row, bottom_row])
def matrix_to_affine(mat):
    """
    Builds an Affine transformation object from a 3x3 matrix, using the first
    six entries of the matrix in row-major order (i.e. its top two rows).
    """
    coefficients = mat.flatten()[:6]
    return Affine(*coefficients)
def affine_transform(transform, points):
    """
    Applies the given Affine object to a set of N x 2 points and returns the
    transformed N x 2 points.
    """
    # Homogeneous coordinates: a 3 x N matrix whose last row is all ones
    ones_row = np.ones((1, points.shape[0]))
    homogeneous = np.vstack([points.T, ones_row])
    matrix = affine_to_matrix(transform)
    mapped = np.dot(matrix, homogeneous)
    # Back to N x 3, then drop the homogeneous coordinate
    return mapped.T[:, :2]  # pylint: disable=unsubscriptable-object
def standardize_json(o, round_digits=4):
    """
    Produces a JSON-compliant object by replacing numpy types with system types
    and rounding floats to save space.

    Args:
        o: The object to convert. Floats, numpy scalars (any floating, integer
            or boolean width), numpy arrays, dicts, lists and tuples are
            converted; dicts, lists, tuples and arrays are processed
            recursively. Anything else is returned unchanged.
        round_digits: Number of decimal places to which floats are rounded.

    Returns:
        The converted object. Tuples and numpy arrays become lists, and dict
        keys are converted with the same rules as values.
    """
    if isinstance(o, np.bool_):
        return bool(o)
    # np.floating / np.integer cover every width (float16, int8, uint16, ...),
    # not just the handful of concrete types that are most common.
    if isinstance(o, (float, np.floating)):
        return round(float(o), round_digits)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.ndarray):
        # tolist() yields nested lists of Python scalars; recurse so that the
        # floats inside are rounded as well.
        return standardize_json(o.tolist(), round_digits)
    if isinstance(o, dict):
        return {
            standardize_json(k, round_digits): standardize_json(v, round_digits)
            for k, v in o.items()
        }
    if isinstance(o, (list, tuple)):
        return [standardize_json(x, round_digits) for x in o]
    return o
@jit(nopython=True)
def inverse_intersection(seqs1, seqs2, mask_ids, outer):
    """
    Computes an inverse intersection score for each pair of sequences found at
    the same index of seqs1 and seqs2. Both sequences are first filtered using
    mask_ids; the score is then 1 / (1 + k), where k is the number of distinct
    elements the two filtered sequences have in common.

    Args:
        seqs1: A list of iterables
        seqs2: Another list of iterables - must be the same length as seqs1
        mask_ids: Iterable containing objects that should be EXCLUDED if outer
            is True, and INCLUDED if outer is False
        outer: Determines the behavior of mask_ids

    Returns:
        A numpy array with one score per index of seqs1. The score is 1 when
        the filtered sequences share no elements but at least one of them is
        non-empty, and stays 0 when both filtered sequences are empty.
    """
    distances = np.zeros(len(seqs1))
    mask_ids = set(mask_ids)
    for i in range(len(seqs1)):
        # (n in mask_ids) != outer keeps n when it is outside the mask and
        # outer is True, or inside the mask and outer is False.
        set1 = set([n for n in seqs1[i] if (n in mask_ids) != outer])
        set2 = set([n for n in seqs2[i] if (n in mask_ids) != outer])
        num_intersection = len(set1 & set2)
        # Only assign a score when at least one filtered set is non-empty;
        # otherwise the entry keeps its initial value of 0.
        if len(set1) or len(set2):
            distances[i] = 1 / (1 + num_intersection)
    return distances
class LoggingHelper:
    """
    Writes and/or updates a JSON file with interaction information.

    The file holds a single JSON object containing a creation "timestamp", the
    "platform" string, the Python "version", any additional caller-supplied
    keys, and a "logs" list to which entries are appended.
    """

    def __init__(self, filepath, addl_info=None):
        """
        Args:
            filepath: Path of the JSON log file. If the file does not exist
                yet it is created with the header fields and an empty "logs"
                list; an existing file is left untouched.
            addl_info: Optional dict merged into the header of a newly created
                file. Ignored when the file already exists.
        """
        super().__init__()
        self.filepath = filepath
        if not os.path.exists(self.filepath):
            current_data = {
                "timestamp": str(datetime.datetime.now()),
                "platform": platform.platform(),
                "version": sys.version,
                "logs": []
            }
            if addl_info is not None:
                current_data.update(addl_info)
            self._write(current_data)

    def add_logs(self, entries):
        """
        Adds a list of logging entries to the log file.

        Raises:
            TypeError: If an entry is not JSON-serializable. The log file is
                left unchanged in that case.
        """
        with open(self.filepath, "r", encoding="utf-8") as file:
            current_data = json.load(file)
        current_data["logs"] += entries
        self._write(current_data)

    def _write(self, data):
        """Serializes data to JSON and overwrites the log file with it."""
        # Serialize BEFORE opening the file: opening with "w" truncates it, so
        # a serialization error raised mid-write would otherwise leave a
        # corrupt file and destroy every previously recorded log.
        serialized = json.dumps(data)
        with open(self.filepath, "w", encoding="utf-8") as file:
            file.write(serialized)
def choose_integer_type(values):
    """
    Chooses the smallest integer type (i.e. np.(u)int(8|16|32|64)) that can
    represent every one of the given values. Unsigned types are used when no
    value is negative, signed types otherwise.

    Args:
        values: Array-like of integer values.

    Returns:
        A tuple (dtype, code) where dtype is the numpy integer type and code
        is its short name ("u1", "u2", "u4", "u8", "i1", "i2", "i4" or "i8").
        An empty input yields (np.uint8, "u1").
    """
    values = np.asarray(values)
    if values.size == 0:
        return np.uint8, "u1"

    # Convert to Python ints so the comparisons below can never overflow,
    # whatever the dtype of the input array is.
    min_val = int(values.min())
    max_val = int(values.max())

    if min_val < 0:
        candidates = ((np.int8, "i1"), (np.int16, "i2"), (np.int32, "i4"))
        widest = (np.int64, "i8")
    else:
        candidates = ((np.uint8, "u1"), (np.uint16, "u2"), (np.uint32, "u4"))
        widest = (np.uint64, "u8")

    # The type must hold the actual minimum and maximum, not merely their
    # difference: e.g. [250, 300] spans only 50 but does not fit in a uint8.
    for dtype, code in candidates:
        bounds = np.iinfo(dtype)
        if bounds.min <= min_val and max_val <= bounds.max:
            return dtype, code
    return widest
def _detect_numerical_sequence(arr):
    """
    Detects a numerical sequence to compress large arrays of integer IDs when
    they are regularly spaced. If a sequence is detected, returns the start, end,
    and step such that using np.arange() with these three arguments yields the
    appropriate result. If no sequence is detected, returns None.

    Arrays with fewer than two elements, and arrays whose elements are all
    equal (step of zero), are not considered sequences and yield None.
    """
    # At least two elements are needed to define a step.
    if len(arr) < 2:
        return None
    diffs = arr[1:] - arr[:-1]
    step = diffs[0]
    # np.arange() cannot reproduce a constant array, since a step of zero is
    # invalid, so such input is reported as "no sequence".
    if step == 0 or not np.allclose(diffs, step):
        return None
    return (arr[0], arr[-1] + step, step)
def encode_numerical_array(arr, astype=np.float32, positions=None, interval=None):
    """
    Serializes a numpy array to base64 so it can be transferred quickly to the
    widget frontend. The numbers are cast to the type 'astype' and their raw
    bytes (in C order) are base64-encoded under the "values" key.

    positions, when provided, is a numpy array giving the position at which
    the data for each ID *ends*. With ten IDs and ten numbers per ID, for
    instance, it would be [10, 20, ..., 90, 100]. It is encoded as base64
    int32 data under the "positions" key.

    interval, when provided, is copied into the result unchanged under the
    "interval" key (it means the same as positions, but for data that is
    regularly spaced).
    """
    # TODO support saving arrays as numerical sequence metadata
    # (see _detect_numerical_sequence)

    def _to_base64(data, dtype):
        return base64.b64encode(data.astype(dtype)).decode('ascii')

    # base64 needs a C-contiguous buffer, so re-lay out the array if required
    contiguous = arr if arr.flags['C_CONTIGUOUS'] else arr.copy(order='C')

    encoded = {"values": _to_base64(contiguous, astype)}
    if positions is not None:
        encoded["positions"] = _to_base64(positions, np.int32)
    if interval is not None:
        encoded["interval"] = interval
    return encoded
def encode_object_array(arr):
    """
    Serializes the given array to JSON (after standardizing its contents) and
    returns a dict whose "values" key holds the base64 encoding of that JSON
    string.
    """
    items = arr.tolist() if isinstance(arr, np.ndarray) else arr
    json_text = json.dumps(standardize_json(items))
    encoded = base64.b64encode(json_text.encode("utf-8"))
    return {"values": encoded.decode('ascii')}
def decode_numerical_array(obj, astype=np.float32):
    """
    Decodes the given compressed dict into an array of the given dtype. The
    dict should contain a 'values' key (base64 string) and optionally a
    'positions' key (base64 string to be turned into an int32 array, defining
    the shape of a 2d matrix) or an 'interval' key (integer defining the number
    of columns in the 2d matrix).

    Returns:
        A 1d array when neither 'positions' nor 'interval' is present (or when
        'positions' is empty), otherwise a 2d array with one row per ID.

    Raises:
        AssertionError: If the positions are not regularly spaced, which is
            not currently supported.
    """
    values = np.frombuffer(base64.decodebytes(obj["values"].encode('ascii')), dtype=astype)
    if "positions" in obj:
        positions = np.frombuffer(base64.decodebytes(obj["positions"].encode('ascii')), dtype=np.int32)
        if positions.size == 0:
            # No IDs at all: there is no row width to reshape by.
            return values
        if positions.size == 1:
            # A single ID: its end position is the width of the only row.
            width = positions[0]
        else:
            deltas = positions[1:] - positions[:-1]
            # Raised explicitly (rather than with an assert statement) so the
            # check still runs under `python -O`; the exception type is kept
            # for callers that already handle AssertionError.
            if not np.allclose(deltas, deltas[0]):
                raise AssertionError("cannot currently decode numerical arrays with non-standard positions array")
            width = deltas[0]
        values = values.reshape(-1, int(width))
    elif "interval" in obj:
        values = values.reshape(-1, obj["interval"])
    return values
def decode_object_array(obj):
    """
    Reads the base64 string stored under the 'values' key of the given object
    and parses the decoded bytes as JSON.
    """
    encoded = obj["values"].encode('ascii')
    raw_json = base64.b64decode(encoded)
    return json.loads(raw_json)
Functions
def affine_to_matrix(t)
-
Returns the 3x3 matrix representation of the given transformation, built row by row from its coefficients a through i.
Expand source code
def affine_to_matrix(t): """ Returns a 3x3 matrix representing the transformation matrix. """ return np.array([ [t.a, t.b, t.c], [t.d, t.e, t.f], [t.g, t.h, t.i] ])
def affine_transform(transform, points)
-
Transforms a set of N x 2 points using the given Affine object.
Expand source code
def affine_transform(transform, points): """ Transforms a set of N x 2 points using the given Affine object. """ reshaped_points = np.vstack([points.T, np.ones((1, points.shape[0]))]) transformed = np.dot(affine_to_matrix(transform), reshaped_points) return transformed.T[:,:2] # pylint: disable=unsubscriptable-object
def choose_integer_type(values)
-
Chooses the best integer type (i.e. np.(u)int(8|16|32|64)) for the given set of values. Returns the dtype and its short name (e.g. "u1" or "i4").
Expand source code
def choose_integer_type(values): """ Chooses the best integer type (i.e. np.(u)int(8|16|32)) for the given set of values. Returns the dtype and its name. """ min_val = values.min() max_val = values.max() rng = max_val - min_val if min_val < 0: if rng > 2 ** 32 - 1: return np.int64, "i8" elif rng > 2 ** 16 - 1: return np.int32, "i4" elif rng > 2 ** 8 - 1: return np.int16, "i2" return np.int8, "i1" elif rng > 2 ** 32 - 1: return np.uint64, "u8" elif rng > 2 ** 16 - 1: return np.uint32, "u4" elif rng > 2 ** 8 - 1: return np.uint16, "u2" return np.uint8, "u1"
def decode_numerical_array(obj, astype=numpy.float32)
-
Decodes the given compressed dict into an array of the given dtype. The dict should contain a 'values' key (base64 string) and optionally a 'positions' key (base64 string to be turned into an int32 array, defining the shape of a 2d matrix) or an 'interval' key (integer defining the number of columns in the 2d matrix).
Expand source code
def decode_numerical_array(obj, astype=np.float32): """ Decodes the given compressed dict into an array of the given dtype. The dict should contain a 'values' key (base64 string) and optionally a 'positions' key (base64 string to be turned into an int32 array, defining the shape of a 2d matrix) or an 'interval' key (integer defining the number of columns in the 2d matrix). """ values = np.frombuffer(base64.decodebytes(obj["values"].encode('ascii')), dtype=astype) if "positions" in obj: positions = np.frombuffer(base64.decodebytes(obj["positions"].encode('ascii')), dtype=np.int32) deltas = positions[1:] - positions[:-1] assert np.allclose(deltas, deltas[0]), "cannot currently decode numerical arrays with non-standard positions array" values = values.reshape(-1, deltas[0]) elif "interval" in obj: values = values.reshape(-1, obj["interval"]) return values
def decode_object_array(obj)
-
Decodes the given object's 'values' key into a JSON object.
Expand source code
def decode_object_array(obj): """ Decodes the given object's 'values' key into a JSON object. """ return json.loads(base64.b64decode(obj["values"].encode('ascii')))
def encode_numerical_array(arr, astype=numpy.float32, positions=None, interval=None)
-
Encodes the given numpy array into a base64 representation for fast transfer to the widget frontend. The array will be encoded as a sequence of numbers with type 'astype'.
If positions is not None, it should be a numpy array of positions at which the array for each ID ends. For example, if there are ten IDs and ten numbers in the array for each ID, the positions array would be [10, 20, …, 90, 100].
If interval is not None, it is passed into the result object directly (and signifies the same as positions, but with a regularly spaced interval).
Expand source code
def encode_numerical_array(arr, astype=np.float32, positions=None, interval=None): """ Encodes the given numpy array into a base64 representation for fast transfer to the widget frontend. The array will be encoded as a sequence of numbers with type 'astype'. If positions is not None, it should be a numpy array of positions at which the array for each ID *ends*. For example, if there are ten IDs and ten numbers in the array for each ID, the positions array would be [10, 20, ..., 90, 100]. If interval is not None, it is passed into the result object directly (and signifies the same as positions, but with a regularly spaced interval). """ # TODO support saving arrays as numerical sequence metadata # sequence_info = _detect_numerical_sequence(arr) # if sequence_info is not None: # result = { ""} if not arr.flags['C_CONTIGUOUS']: arr = arr.copy(order='C') result = { "values": base64.b64encode(arr.astype(astype)).decode('ascii') } if positions is not None: result["positions"] = base64.b64encode(positions.astype(np.int32)).decode('ascii') if interval is not None: result["interval"] = interval return result
def encode_object_array(arr)
-
Encodes the given array as a base64 string of a JSON string.
Expand source code
def encode_object_array(arr): """ Encodes the given array as a base64 string of a JSON string. """ if isinstance(arr, np.ndarray): arr = arr.tolist() return { "values": base64.b64encode(json.dumps(standardize_json(arr)).encode("utf-8")).decode('ascii') }
def inverse_intersection(seqs1, seqs2, mask_ids, outer)
-
Computes the inverse intersection size of the two lists of sets.
Args
seqs1
- A list of iterables
seqs2
- Another list of iterables - must be the same length as seqs1
mask_ids
- Iterable containing objects that should be EXCLUDED if outer is True, and INCLUDED if outer is False
outer
- Determines the behavior of mask_ids
Returns
A numpy array of inverse intersection sizes between each element in seqs1 and seqs2.
Expand source code
@jit(nopython=True) def inverse_intersection(seqs1, seqs2, mask_ids, outer): """ Computes the inverse intersection size of the two lists of sets. Args: seqs1: A list of iterables seqs2: Another list of iterables - must be the same length as seqs1 mask_ids: Iterable containing objects that should be EXCLUDED if outer is True, and INCLUDED if outer is False outer: Determines the behavior of mask_ids Returns: A numpy array of inverse intersection sizes between each element in seqs1 and seqs2. """ distances = np.zeros(len(seqs1)) mask_ids = set(mask_ids) for i in range(len(seqs1)): set1 = set([n for n in seqs1[i] if (n in mask_ids) != outer]) set2 = set([n for n in seqs2[i] if (n in mask_ids) != outer]) num_intersection = len(set1 & set2) if len(set1) or len(set2): distances[i] = 1 / (1 + num_intersection) return distances
def matrix_to_affine(mat)
-
Returns an Affine transformation object from the given 3x3 matrix.
Expand source code
def matrix_to_affine(mat): """ Returns an Affine transformation object from the given 3x3 matrix. """ return Affine(*(mat.flatten()[:6]))
def projection_standardizer(emb)
-
Returns an affine transformation that translates an embedding so that the centroid of the given set of points lies at the origin.
Expand source code
def projection_standardizer(emb): """Returns an affine transformation to translate an embedding to the centroid of the given set of points.""" return Affine.translation(*(-emb.mean(axis=0)[:2]))
def standardize_json(o, round_digits=4)
-
Produces a JSON-compliant object by replacing numpy types with system types and rounding floats to save space.
Expand source code
def standardize_json(o, round_digits=4): """ Produces a JSON-compliant object by replacing numpy types with system types and rounding floats to save space. """ if isinstance(o, (float, np.float32, np.float64)): return round(float(o), round_digits) if isinstance(o, (np.int64, np.int32, np.uint8)): return int(o) if isinstance(o, dict): return {standardize_json(k, round_digits): standardize_json(v, round_digits) for k, v in o.items()} if isinstance(o, (list, tuple)): return [standardize_json(x, round_digits) for x in o] return o
Classes
class DataType
-
Types of data, e.g. categorical vs continuous.
Expand source code
class DataType: """Types of data, e.g. categorical vs continuous.""" CATEGORICAL = "categorical" CONTINUOUS = "continuous"
Class variables
var CATEGORICAL
var CONTINUOUS
class Field
-
Standardized field names for embeddings and projections. These data can all be versioned within a ColumnarData object.
Expand source code
class Field: """Standardized field names for embeddings and projections. These data can all be versioned within a ColumnarData object.""" POSITION = "position" COLOR = "color" RADIUS = "r" ALPHA = "alpha" # Thumbnail fields NAME = "name" DESCRIPTION = "description"
Class variables
var ALPHA
var COLOR
var DESCRIPTION
var NAME
var POSITION
var RADIUS
class PreviewMode
-
Ways of calculating preview lines.
Expand source code
class PreviewMode: """Ways of calculating preview lines.""" PROJECTION_SIMILARITY = "projectionNeighborSimilarity" NEIGHBOR_SIMILARITY = "neighborSimilarity"
Class variables
var NEIGHBOR_SIMILARITY
var PROJECTION_SIMILARITY
class ProjectionTechnique
-
Names of projection techniques.
Expand source code
class ProjectionTechnique: """Names of projection techniques.""" UMAP = "umap" TSNE = "tsne" ALIGNED_UMAP = "aligned-umap" PCA = "pca"
Class variables
var ALIGNED_UMAP
var PCA
var TSNE
var UMAP
class SidebarPane
-
Indexes of sidebar panes in the widget.
Expand source code
class SidebarPane: """Indexes of sidebar panes in the widget.""" CURRENT = 1 SAVED = 2 RECENT = 3 SUGGESTED = 4
Class variables
var CURRENT
var RECENT
var SAVED
var SUGGESTED