# Source code for hail.experimental.plots

import json
import warnings

import numpy as np
import pandas as pd

import hail as hl
from bokeh.layouts import gridplot
from bokeh.models import *
from bokeh.palettes import Spectral8
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from hail.typecheck import *
from hail.utils.hadoop_utils import *


def plot_roc_curve(ht, scores, tp_label='tp', fp_label='fp', colors=None, title='ROC Curve', hover_mode='mouse'):
    """Create ROC curve from Hail Table.

    One or more `score` fields must be provided, which are assessed against
    `tp_label` and `fp_label` as truth data. High scores should correspond to
    true positives.

    Parameters
    ----------
    ht : :class:`.Table`
        Table with required data.
    scores : :obj:`str` or :obj:`list` of :obj:`str`
        Top-level location of scores in ht against which to generate ROC curves.
    tp_label : :obj:`str`
        Top-level location of true positives in ht.
    fp_label : :obj:`str`
        Top-level location of false positives in ht.
    colors : :obj:`dict` of :obj:`str`
        Optional colors to use (score -> desired color).
    title : :obj:`str`
        Title of plot.
    hover_mode : :obj:`str`
        Hover mode; one of 'mouse' (default), 'vline' or 'hline'.

    Returns
    -------
    :obj:`tuple` of :class:`.Figure` and :obj:`list` of :obj:`float`
        Figure, and list of AUCs corresponding to scores.
    """
    # Normalize to a list before computing the palette, so a single score
    # name is not treated as a sequence of characters.
    if isinstance(scores, str):
        scores = [scores]

    if colors is None:
        # Get a palette automatically
        from bokeh.palettes import d3
        palette = d3['Category10'][max(3, len(scores))]
        colors = {score: palette[i] for i, score in enumerate(scores)}

    total_tp, total_fp = ht.aggregate((hl.agg.count_where(ht[tp_label]),
                                       hl.agg.count_where(ht[fp_label])))

    p = figure(title=title, x_axis_label='FPR', y_axis_label='TPR',
               tools="hover,save,pan,box_zoom,reset,wheel_zoom")
    p.add_layout(Title(text=f'Based on {total_tp} TPs and {total_fp} FPs'), 'above')

    aucs = []
    for score in scores:
        # Sort by descending score, then compute cumulative TPR/FPR with scans.
        ordered_ht = ht.key_by(_score=-ht[score])
        ordered_ht = ordered_ht.select(
            score_name=score, score=ordered_ht[score],
            tpr=hl.scan.count_where(ordered_ht[tp_label]) / total_tp,
            fpr=hl.scan.count_where(ordered_ht[fp_label]) / total_fp,
        ).key_by().drop('_score')
        # Append a final (1.0, 1.0) point so the curve reaches the top-right corner.
        last_row = hl.utils.range_table(1).key_by().select(score_name=score,
                                                           score=hl.float64(float('-inf')),
                                                           tpr=hl.float32(1.0),
                                                           fpr=hl.float32(1.0))
        ordered_ht = ordered_ht.union(last_row)
        # AUC by the trapezoid-free rectangle rule: width of each FPR step times TPR.
        ordered_ht = ordered_ht.annotate(
            auc_contrib=hl.or_else((ordered_ht.fpr - hl.scan.max(ordered_ht.fpr)) * ordered_ht.tpr, 0.0)
        )
        auc = ordered_ht.aggregate(hl.agg.sum(ordered_ht.auc_contrib))
        aucs.append(auc)
        df = ordered_ht.annotate(score_name=ordered_ht.score_name + f' (AUC = {auc:.4f})').to_pandas()
        p.line(x='fpr', y='tpr', legend='score_name', source=ColumnDataSource(df),
               color=colors[score], line_width=3)

    p.legend.location = 'bottom_right'
    p.legend.click_policy = 'hide'
    p.select_one(HoverTool).tooltips = [(x, f"@{x}") for x in ('score_name', 'score', 'tpr', 'fpr')]
    p.select_one(HoverTool).mode = hover_mode
    return p, aucs
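
# Illustrative usage (a minimal sketch; the table path and field names below are
# assumptions, not part of this module). `plot_roc_curve` expects boolean truth
# fields and one or more float score fields, and returns the figure plus the
# AUC computed for each score:
#
#     import hail as hl
#     from bokeh.io import show
#     from hail.experimental.plots import plot_roc_curve
#
#     ht = hl.read_table('gs://my-bucket/variants.ht')  # hypothetical path
#     p, aucs = plot_roc_curve(ht, scores=['my_score'], tp_label='tp', fp_label='fp')
#     show(p)  # higher scores should rank true positives first
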
@typecheck(t_path=str)
def hail_metadata(t_path):
    """Create a metadata plot for a Hail Table or MatrixTable.

    Parameters
    ----------
    t_path : str
        Path to the Hail Table or MatrixTable files.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` or :class:`bokeh.models.widgets.panels.Tabs` or :class:`bokeh.models.layouts.Column`
    """
    def get_rows_data(rows_files):
        # Collect per-partition file sizes and, when available, the
        # partition interval bounds from metadata.json.gz.
        file_sizes = []
        partition_bounds = []
        parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
        if parts_file:
            parts = hadoop_ls(parts_file[0])
            for i, x in enumerate(parts):
                index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                if i < len(parts) - 1:
                    # Skip duplicate part files that share the same partition index.
                    test_index = parts[i + 1]['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                    if test_index == index:
                        continue
                file_sizes.append(x['size_bytes'])
        metadata_file = [x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')]
        if metadata_file:
            with hadoop_open(metadata_file[0], 'rb') as f:
                rows_meta = json.loads(f.read())
            try:
                partition_bounds = [
                    (x['start']['locus']['contig'], x['start']['locus']['position'],
                     x['end']['locus']['contig'], x['end']['locus']['position'])
                    for x in rows_meta['jRangeBounds']]
            except KeyError:
                pass
        return partition_bounds, file_sizes

    def scale_file_sizes(file_sizes):
        # Pick a human-readable unit for the total and per-file sizes.
        min_file_size = min(file_sizes) * 1.1
        total_file_size = sum(file_sizes)
        all_scales = [
            ('T', 1e12),
            ('G', 1e9),
            ('M', 1e6),
            ('K', 1e3),
            ('', 1e0)
        ]
        for overall_scale, overall_factor in all_scales:
            if total_file_size > overall_factor:
                total_file_size /= overall_factor
                break
        for scale, factor in all_scales:
            if min_file_size > factor:
                file_sizes = [x / factor for x in file_sizes]
                break
        total_file_size = f'{total_file_size:.1f} {overall_scale}B'
        return total_file_size, file_sizes, scale

    files = hadoop_ls(t_path)

    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    success_file = [x['modification_time'] for x in files if x['path'].endswith('SUCCESS')]

    metadata_file = [x['path'] for x in files if x['path'].endswith('metadata.json.gz')]
    if not metadata_file:
        raise FileNotFoundError('No metadata.json.gz file found.')

    with hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts']['counts']

    if not rows_file:
        raise FileNotFoundError('No rows directory found.')
    rows_files = hadoop_ls(rows_file[0])

    data_type = 'Table'
    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [x['path'] for x in rows_files if x['path'].endswith('rows')]
        rows_files = hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes)

    panel_size = 480
    subpanel_size = 120

    if not row_partition_bounds:
        warnings.warn('Table is not partitioned. Only plotting file sizes')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths': [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds': [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': ['Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
                             for x in row_partition_bounds],
        'row_file_sizes': row_file_sizes,
        'row_file_sizes_human': [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition': rows_per_partition,
        'index': list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hadoop_ls(entries_file[0])
        entries_rows_file = [x['path'] for x in entries_rows_files if x['path'].endswith('rows')]
        if entries_rows_file:
            entries_files = hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [f'{x:.1f} {entry_scale}B' for x in entry_file_sizes]

    title = f'{data_type}: {t_path}'

    msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>"
    if success_file[0]:
        msg += success_file[0]

    tools = "hover,save,pan,box_zoom,reset,wheel_zoom"

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition', 'row_file_sizes', color=color_map,
              legend='spans_chromosome', source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [(x, f'@{x}') for x in
                                        ('rows_per_partition', 'row_file_sizes_human',
                                         'partition_bounds', 'index')]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                              left=rows_per_partition_edges[:-1], right=rows_per_partition_edges[1:],
                              fill_color="#036564", line_color="#033649")
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist, left=0,
                     bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                     fill_color="#036564", line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        # Build a second grid for entry data and present both as tabs.
        title = f'Statistics for {data_type}: {t_path}'
        msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_entry_file_size}<br/>"
        if success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                                factors=list(set(all_data['spans_chromosome'])))
        p.scatter('rows_per_partition', 'entry_file_sizes', color=color_map,
                  legend='spans_chromosome', source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [(x, f'@{x}') for x in
                                            ('rows_per_partition', 'entry_file_sizes_human',
                                             'partition_bounds', 'index')]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                                  left=rows_per_partition_edges[:-1], right=rows_per_partition_edges[1:],
                                  fill_color="#036564", line_color="#033649")
        p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size)
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")

        entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

        return Tabs(tabs=[Panel(child=entries_grid, title='Entries'), Panel(child=rows_grid, title='Rows')])
    else:
        return rows_grid
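
# Illustrative usage (a minimal sketch; the dataset path is an assumption).
# For a MatrixTable, `hail_metadata` returns Bokeh Tabs with 'Entries' and
# 'Rows' panels; for a Table, a single grid of partition-size plots; for an
# unpartitioned table, just a file-size histogram:
#
#     from bokeh.io import show
#     from hail.experimental.plots import hail_metadata
#
#     p = hail_metadata('gs://my-bucket/dataset.mt')  # hypothetical path
#     show(p)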