"""
Tabular datatype
"""
import pkg_resources
pkg_resources.require( "bx-python" )
import gzip
import logging
import os
from cgi import escape
from galaxy import util
from galaxy.datatypes import data
from galaxy.datatypes import metadata
from galaxy.datatypes.checkers import is_gzip
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.sniff import get_headers, get_test_fname
from galaxy.util.json import dumps
import dataproviders
log = logging.getLogger(__name__)
@dataproviders.decorators.has_dataproviders
class Tabular( data.Text ):
    """Tab delimited data"""

    # All tabular data is chunkable.
    CHUNKABLE = True

    # Add metadata elements
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )
    MetadataElement( name="column_names", default=[], desc="Column names", readonly=True, visible=False, optional=True, no_value=[] )

    def make_html_table( self, dataset, **kwargs ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( self.make_html_peek_header( dataset, **kwargs ) )
            out.append( self.make_html_peek_rows( dataset, **kwargs ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def make_html_peek_rows( self, dataset, skipchars=None, **kwargs ):
        """Build the <tr> rows of the peek table from ``dataset.peek``.

        Lines starting with any of ``skipchars``, and lines whose column
        count does not match ``dataset.metadata.columns``, are rendered as
        a single full-width cell instead of one cell per column.
        Raises Exception (after logging) if the peek cannot be rendered.
        """
        if skipchars is None:
            skipchars = []
        out = []
        try:
            if not dataset.peek:
                dataset.set_peek()
            columns = dataset.metadata.columns
            if columns is None:
                columns = dataset.metadata.spec.columns.no_value
            for line in dataset.peek.splitlines():
                if line.startswith( tuple( skipchars ) ):
                    # '%%' in the template renders a literal '%' after the format op.
                    out.append( '<tr><td colspan="100%%">%s</td></tr>' % escape( line ) )
                elif line:
                    elems = line.split( '\t' )
                    # we may have an invalid comment line or invalid data
                    if len( elems ) != columns:
                        out.append( '<tr><td colspan="100%%">%s</td></tr>' % escape( line ) )
                    else:
                        out.append( '<tr>' )
                        for elem in elems:
                            out.append( '<td>%s</td>' % escape( elem ) )
                        out.append( '</tr>' )
        except Exception as exc:
            log.exception( 'make_html_peek_rows failed on HDA %s' % dataset.id )
            raise Exception( "Can't create peek rows %s" % str( exc ) )
        return "".join( out )

    def get_chunk(self, trans, dataset, chunk):
        """Return chunk number ``chunk`` of the dataset file as a JSON document.

        Chunks are nominally ``display_chunk_size`` bytes, aligned to line
        boundaries: reading starts just past the first newline after the byte
        offset, and the chunk is extended past its nominal end until a newline.
        """
        ck_index = int(chunk)
        f = open(dataset.file_name)
        try:
            f.seek(ck_index * trans.app.config.display_chunk_size)
            # If we aren't at the start of the file, seek to next newline.  Do this better eventually.
            if f.tell() != 0:
                cursor = f.read(1)
                while cursor and cursor != '\n':
                    cursor = f.read(1)
            ck_data = f.read(trans.app.config.display_chunk_size)
            # Extend the chunk so it ends on a complete line.
            cursor = f.read(1)
            while cursor and ck_data[-1] != '\n':
                ck_data += cursor
                cursor = f.read(1)
        finally:
            # The original implementation leaked this handle; always close it.
            f.close()
        return dumps( { 'ck_data': util.unicodify( ck_data ), 'ck_index': ck_index + 1 } )

    def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, chunk=None, **kwd):
        """Serve the dataset as a JSON chunk, a raw download, or an HTML
        preview, depending on the request parameters."""
        preview = util.string_as_bool( preview )
        if chunk:
            return self.get_chunk(trans, dataset, chunk)
        elif to_ext or not preview:
            to_ext = to_ext or dataset.extension
            return self._serve_raw(trans, dataset, to_ext)
        elif dataset.metadata.columns > 50:
            # Fancy tabular display is only suitable for datasets without an incredibly large number of columns.
            # We should add a new datatype 'matrix', with its own draw method, suitable for this kind of data.
            # For now, default to the old behavior, ugly as it is.  Remove this after adding 'matrix'.
            max_peek_size = 1000000  # 1 MB
            if os.stat( dataset.file_name ).st_size < max_peek_size:
                return open( dataset.file_name )
            else:
                trans.response.set_content_type( "text/html" )
                # Read the truncated preview up front and close the handle
                # (the original leaked it by passing open(...).read(...)).
                with open( dataset.file_name ) as f:
                    truncated_data = f.read( max_peek_size )
                return trans.stream_template_mako( "/dataset/large_file.mako",
                                                   truncated_data=truncated_data,
                                                   data=dataset )
        else:
            # 'null' is rendered as a JavaScript null in the mako template.
            column_names = 'null'
            if dataset.metadata.column_names:
                column_names = dataset.metadata.column_names
            elif hasattr(dataset.datatype, 'column_names'):
                column_names = dataset.datatype.column_names
            column_types = dataset.metadata.column_types
            if not column_types:
                column_types = []
            column_number = dataset.metadata.columns
            if column_number is None:
                column_number = 'null'
            return trans.fill_template( "/dataset/tabular_chunked.mako",
                                        dataset=dataset,
                                        chunk=self.get_chunk(trans, dataset, 0),
                                        column_number=column_number,
                                        column_names=column_names,
                                        column_types=column_types )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False):
        """Set the dataset peek, appending a comment-line count to the blurb."""
        super(Tabular, self).set_peek( dataset, line_count=line_count, is_multi_byte=is_multi_byte)
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        """Return True when the dataset is in a state suitable for display."""
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines != 0
        except Exception:
            # Missing/unset metadata means the dataset can't be displayed.
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        """Return a file handle for GBrowse display (caller is responsible for closing)."""
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Return a file handle for UCSC display (caller is responsible for closing)."""
        return open( dataset.file_name )

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
    def column_dataprovider( self, dataset, **settings ):
        """Uses column settings that are passed in"""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.column.ColumnarDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-column',
                                                    dataproviders.column.ColumnarDataProvider.settings )
    def dataset_column_dataprovider( self, dataset, **settings ):
        """Attempts to get column settings from dataset.metadata"""
        return dataproviders.dataset.DatasetColumnarDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
    def dict_dataprovider( self, dataset, **settings ):
        """Uses column settings that are passed in"""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.column.DictDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
    def dataset_dict_dataprovider( self, dataset, **settings ):
        """Attempts to get column settings from dataset.metadata"""
        return dataproviders.dataset.DatasetDictDataProvider( dataset, **settings )
class Taxonomy( Tabular ):

    def __init__(self, **kwd):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        # Fixed taxonomic-rank headings, ordered from most to least inclusive.
        self.column_names = ( 'Name TaxId Root Superkingdom Kingdom Subkingdom '
                              'Superphylum Phylum Subphylum Superclass Class Subclass '
                              'Superorder Order Suborder Superfamily Family Subfamily '
                              'Tribe Subtribe Genus Subgenus Species Subspecies' ).split()

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
@dataproviders.decorators.has_dataproviders
class Sam( Tabular ):
    file_ext = 'sam'
    track_type = "ReadTrack"
    data_sources = { "data": "bam", "index": "bigwig" }

    def __init__(self, **kwd):
        """Initialize sam datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line::

          @QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL

        or

          @QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL OPT

        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True::

            There must be 11 or more columns of data on each line
            Columns 2 (FLAG), 4(POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
            We will only check that up to the first 5 alignments are correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                count = 0
                while True:
                    line = fh.readline().strip()
                    if not line:
                        break  # EOF (or a blank line; scanning stops there)
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 11:
                            return False
                        try:
                            # FLAG, POS, MAPQ, MPOS, ISIZE must parse as integers.
                            int(linePieces[1])
                            int(linePieces[3])
                            int(linePieces[4])
                            int(linePieces[7])
                            int(linePieces[8])
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
                # Fewer than 5 alignments, but every one seen was valid.
                if 0 < count < 5:
                    return True
            finally:
                # The original leaked the handle on every early return.
                fh.close()
        except Exception:
            pass
        return False

    @staticmethod
    def merge( split_files, output_file):
        """
        Multiple SAM files may each have headers.  Since the headers should all be the same, remove
        the headers from files 1-n, keeping them in the first file only.

        NOTE(review): file names are interpolated into a shell command line
        (os.system); this is unsafe if the paths can contain shell
        metacharacters — consider subprocess with an argument list.
        """
        cmd = 'mv %s %s' % ( split_files[0], output_file )
        result = os.system(cmd)
        if result != 0:
            raise Exception('Result %s from %s' % (result, cmd))
        if len(split_files) > 1:
            cmd = 'egrep -v -h "^@" %s >> %s' % ( ' '.join(split_files[1:]), output_file )
            result = os.system(cmd)
            if result != 0:
                raise Exception('Result %s from %s' % (result, cmd))

    # ------------- Dataproviders
    # sam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers
    # TODO:?? seems like there should be an easier way to do this - metadata.comment_char?
    @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
    def line_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).line_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
    def regex_line_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).regex_line_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
    def column_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).column_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-column',
                                                    dataproviders.column.ColumnarDataProvider.settings )
    def dataset_column_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dataset_column_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
    def dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
    def dataset_dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dataset_dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'header', dataproviders.line.RegexLineDataProvider.settings )
    def header_dataprovider( self, dataset, **settings ):
        # NOTE(review): in the reviewed source this factory decorator was
        # stacked directly on id_seq_qual_dataprovider (its method body was
        # apparently lost), which would mis-register 'header'.  Restored the
        # standard header provider: only the '@'-prefixed header lines.
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.line.RegexLineDataProvider( dataset_source, regex_list=[ '^@' ], **settings )

    @dataproviders.decorators.dataprovider_factory( 'id-seq-qual', dict_dataprovider.settings )
    def id_seq_qual_dataprovider( self, dataset, **settings ):
        # provided as an example of a specified column dict (w/o metadata)
        settings[ 'indeces' ] = [ 0, 9, 10 ]
        settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ]
        return self.dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 2, 3, 3, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 2, 3, 3, True, **settings )

    #@dataproviders.decorators.dataprovider_factory( 'samtools' )
    #def samtools_dataprovider( self, dataset, **settings ):
    #    dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
    #    return dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings )
@dataproviders.decorators.has_dataproviders
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"
    line_class = "genomic coordinate"
    data_sources = { "data": "tabix" }

    # Add metadata elements
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=2, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'baseCol':'Base'} )

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # chrom start in column 1 (with 0-based columns)
                    # and reference base is in column 2
                    try:
                        int( hdr[1] )
                    except ValueError:
                        return False
                    if hdr[2] not in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]:
                        return False
            return True
        except Exception:
            # Anything unexpected (odd header structure, etc.) is "not pileup".
            return False

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )
@dataproviders.decorators.has_dataproviders
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """
    track_type = "VariantTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','int','str','str','str','int','str','list','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", desc="Score column for visualization", default=[5], param=metadata.ColumnParameter, optional=True, multiple=True, visible=False )
    MetadataElement( name="sample_names", default=[], desc="Sample names", readonly=True, visible=False, optional=True, no_value=[] )

    def sniff( self, filename ):
        """A file is VCF when its first line starts with '##fileformat=VCF'."""
        headers = get_headers( filename, '\n', count=1 )
        try:
            return headers[0][0].startswith("##fileformat=VCF")
        except IndexError:
            # Empty file / no first line: not VCF (the original raised here).
            return False

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 0, 1, 1, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )
class Eland( Tabular ):
    """Support for the export.txt.gz file used by Illumina's ELANDv2e aligner"""
    file_ext = '_export.txt.gz'

    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=[], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="comment_lines", default=0, desc="Number of comments", readonly=True, visible=False )
    MetadataElement( name="tiles", default=[], param=metadata.ListParameter, desc="Set of tiles", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="reads", default=[], param=metadata.ListParameter, desc="Set of reads", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="lanes", default=[], param=metadata.ListParameter, desc="Set of lanes", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="barcodes", default=[], param=metadata.ListParameter, desc="Set of barcodes", readonly=True, visible=False, no_value=[] )

    def __init__(self, **kwd):
        """Initialize eland datatype"""
        Tabular.__init__( self, **kwd )
        # 22 column headings, matching the 22-column requirement in sniff().
        # The original was missing the comma after 'PART_CHROM', silently
        # concatenating it with 'PART_CONTIG' and yielding only 21 names.
        self.column_names = ['MACHINE', 'RUN_NO', 'LANE', 'TILE', 'X', 'Y',
                             'INDEX', 'READ_NO', 'SEQ', 'QUAL', 'CHROM', 'CONTIG',
                             'POSITION', 'STRAND', 'DESC', 'SRAS', 'PRAS', 'PART_CHROM',
                             'PART_CONTIG', 'PART_OFFSET', 'PART_STRAND', 'FILT'
                             ]

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # This data type requires at least 11 columns in the data
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i+1 ) )
                out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in ELAND export format

        A file in ELAND export format consists of lines of tab-separated data.
        There is no header.

        Rules for sniffing as True::

            - There must be 22 columns on each line
            - LANE, TILEm X, Y, INDEX, READ_NO, SEQ, QUAL, POSITION, *STRAND, FILT must be correct
            - We will only check that up to the first 5 alignments are correctly formatted.
        """
        try:
            if is_gzip(filename):
                fh = gzip.GzipFile(filename, 'r')
            else:
                fh = open( filename )
        except Exception:
            # Could not even open the file: not sniffable.
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF (or a blank line; scanning stops there)
                linePieces = line.split('\t')
                if len(linePieces) != 22:
                    return False
                try:
                    # RUN_NO, LANE, TILE must be non-negative integers;
                    # int() handles arbitrarily large values (was long()).
                    if int(linePieces[1]) < 0:
                        raise Exception('Out of range')
                    if int(linePieces[2]) < 0:
                        raise Exception('Out of range')
                    if int(linePieces[3]) < 0:
                        raise Exception('Out of range')
                    int(linePieces[4])
                    int(linePieces[5])
                    # can get a lot more specific
                except ValueError:
                    return False
                count += 1
                if count == 5:
                    break
            return count > 0
        except Exception:
            # 'Out of range' and anything unexpected mean "not ELAND".
            return False
        finally:
            # The original leaked the handle on early returns; always close.
            fh.close()
class ElandMulti( Tabular ):
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        # This format is never autodetected; it must be selected explicitly.
        return False
class FeatureLocationIndex( Tabular ):
    """
    An index that stores feature locations in tabular format.
    """
    file_ext = 'fli'

    # Always two string columns (feature name, location).
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str', 'str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False, no_value=[] )