Package Bio :: Package Phylo :: Module CDAOIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.CDAOIO

  1  # Copyright (C) 2013 by Ben Morris (ben@bendmorris.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox 
  3  # and Bio.Phylo.Newick, copyright 2009 by Eric Talevich. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license. Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """I/O function wrappers for the RDF/CDAO file format. 
 10   
 11  This is an RDF format that conforms to the Comparative Data Analysis Ontology (CDAO). 
 12  See: http://www.evolutionaryontology.org/cdao 
 13   
 14  This module requires the RDFlib Python package (rdflib), version 3.2.1 or later. 
 15   
 16  The CDAOIO.Parser, in addition to parsing text files, can also parse directly 
 17  from a triple store that implements the Redland storage interface; similarly, 
 18  the CDAOIO.Writer can store triples in a triple store instead of serializing 
 19  them to a file. 
 20  """ 
 21   
 22  from Bio._py3k import StringIO 
 23   
 24  from Bio import MissingPythonDependencyError 
 25   
 26  from Bio.Phylo import CDAO 
 27  from ._cdao_owl import cdao_elements, cdao_namespaces, resolve_uri 
 28  import os 
class CDAOError(Exception):
    """Signal that building a CDAO object could not continue (DEPRECATED).

    Plain ``Exception`` subclass with no additional state or behaviour.
    """
# rdflib is a hard requirement of this module.  Only the import itself is
# guarded by ``except ImportError``: the version check below must stay outside
# the ``try`` so that its more specific message is not replaced by the generic
# "requires RDFlib" one if the raised error is itself an ImportError subclass.
try:
    import rdflib
except ImportError:
    raise MissingPythonDependencyError(
        'Support for CDAO tree format requires RDFlib.')

rdfver = rdflib.__version__


def _rdflib_is_too_old(version):
    """Return True if the dotted version string predates RDFlib 3.2.1.

    Only the leading ``major[.minor[.micro]]`` digits are compared, and they
    are compared numerically, so e.g. "10.0.0" is not mistaken for a 1.x
    release.  A string that does not start with a digit is not rejected.
    """
    import re

    match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', version)
    if match is None:
        # Unknown version scheme: give it the benefit of the doubt.
        return False
    numbers = tuple(int(group or 0) for group in match.groups())
    return numbers < (3, 2, 1)


if _rdflib_is_too_old(rdfver):
    raise MissingPythonDependencyError(
        'Support for CDAO tree format requires RDFlib v3.2.1 or later.')

# Prefix -> namespace URI table used both to expand prefixed names (qUri) and
# to abbreviate URIs again when writing.
RDF_NAMESPACES = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}
RDF_NAMESPACES.update(cdao_namespaces)
# pad node ids with zeroes until they're at least this length
ZEROES = 8
def qUri(x):
    """Expand *x* via ``resolve_uri`` using this module's namespace table.

    :param x: the value to resolve; passed to ``resolve_uri`` unchanged.
    :returns: whatever ``resolve_uri`` returns for *x* when called with
        ``namespaces=RDF_NAMESPACES``.
    """
    resolved = resolve_uri(x, namespaces=RDF_NAMESPACES)
    return resolved
60
def format_label(x):
    """Return label *x* with every underscore turned into a space."""
    pieces = x.split('_')
    return ' '.join(pieces)
65
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a CDAO file handle.

    A ``Parser`` is created for *handle* and every keyword argument is
    forwarded to its ``parse`` method.

    :returns: generator of Bio.Phylo.CDAO.Tree objects.

    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
77
def write(trees, handle, plain=False, **kwargs):
    """Write trees in CDAO format to the given file handle.

    A ``Writer`` is created for *trees*; *plain* and all remaining keyword
    arguments are forwarded to its ``write`` method.

    :returns: the value returned by ``Writer.write``.

    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
86
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a CDAO tree given a file handle.

    The handle's content is loaded into an rdflib graph; the triples are then
    indexed (``get_node_info``) and turned into nested ``CDAO.Clade`` objects,
    one ``CDAO.Tree`` per ``cdao:has_Root`` statement found.
    """

    def __init__(self, handle=None):
        """Initialize the parser state for the given file handle."""
        self.handle = handle
        self.graph = None
        self.node_info = None
        # The three containers below are rebuilt by get_node_info(); they are
        # created here too so the attributes exist on a fresh instance.
        self.obj_info = {}
        self.nodes = set()
        self.tree_roots = set()
        self.children = {}
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Instantiate the class from the given string."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, **kwargs):
        """Parse the text stream this object was initialized with.

        Keyword arguments are forwarded to ``parse_handle_to_graph``.

        :returns: generator of CDAO.Tree objects (see ``parse_graph``).
        """
        self.parse_handle_to_graph(**kwargs)
        return self.parse_graph()

    def _default_base_uri(self):
        """Return a ``file://`` URI for the handle's path, or None.

        None is returned when the handle has no ``name`` attribute (e.g. the
        StringIO created by ``from_string``) or when the name is an integer
        file descriptor rather than a path.
        """
        name = getattr(self.handle, 'name', None)
        if name is None or isinstance(name, int):
            return None
        # Windows style slashes cannot be used in an RDF URI
        return "file://" + os.path.abspath(name).replace("\\", "/")

    def parse_handle_to_graph(self, rooted=False,
                              parse_format='turtle', context=None, **kwargs):
        """Parse self.handle into the RDF graph self.graph.

        :param rooted: stored on ``self.rooted`` and later passed to every
            ``CDAO.Tree`` that is created.
        :param parse_format: format name handed to ``rdflib.Graph.parse``.
        :param context: forwarded to ``parse_graph``.
        :param kwargs: only ``base_uri`` is consulted; when absent the base
            URI is derived from the handle's file name, if it has one.
        :returns: the generator produced by ``parse_graph``.
        """
        if self.graph is None:
            self.graph = rdflib.Graph()
        graph = self.graph

        for prefix, namespace in RDF_NAMESPACES.items():
            graph.bind(prefix, namespace)

        self.rooted = rooted

        path_uri = self._default_base_uri()
        base_uri = kwargs['base_uri'] if 'base_uri' in kwargs else path_uri

        if path_uri is not None:
            graph.parse(file=self.handle, publicID=base_uri,
                        format=parse_format)
        else:
            # A handle without a usable name (such as the StringIO built by
            # from_string) offers no path to derive a location from, so its
            # content is read and handed to rdflib as in-memory data instead.
            graph.parse(data=self.handle.read(), publicID=base_uri,
                        format=parse_format)

        return self.parse_graph(graph, context=context)

    def parse_graph(self, graph=None, context=None):
        """Generator that yields CDAO.Tree instances from an RDF model.

        :param graph: graph to read; defaults to ``self.graph``.
        :param context: forwarded to ``get_node_info``.
        """
        if graph is None:
            graph = self.graph

        # look up branch lengths/TUs for all nodes
        self.get_node_info(graph, context=context)

        for root_node in self.tree_roots:
            clade = self.parse_children(root_node)

            yield CDAO.Tree(root=clade, rooted=self.rooted)

    def new_clade(self, node):
        """Return a CDAO.Clade object for a given named node.

        Only the ``branch_length``, ``label`` (as the clade name, with
        underscores replaced by spaces) and ``confidence`` entries recorded
        for *node* in ``self.node_info`` are used.
        """
        result = self.node_info[node]

        kwargs = {}
        if 'branch_length' in result:
            kwargs['branch_length'] = result['branch_length']
        if 'label' in result:
            kwargs['name'] = result['label'].replace('_', ' ')
        if 'confidence' in result:
            kwargs['confidence'] = result['confidence']

        return CDAO.Clade(**kwargs)

    def get_node_info(self, graph, context=None):
        """Create a dictionary containing information about all nodes in the tree.

        Fills ``self.obj_info`` (per-subject predicate values), ``self.nodes``
        (subjects typed as CDAO nodes), ``self.tree_roots`` (objects of
        ``cdao:has_Root``), ``self.node_info`` (per-node clade data) and
        ``self.children`` (parent -> list of child nodes).

        :param graph: iterable of (subject, predicate, object) triples.
        :param context: accepted for interface compatibility; not used here.
        """
        self.node_info = {}
        self.obj_info = {}
        self.children = {}
        self.nodes = set()
        self.tree_roots = set()

        assignments = {
            qUri('cdao:has_Parent'): 'parent',
            qUri('cdao:belongs_to_Edge_as_Child'): 'edge',
            qUri('cdao:has_Annotation'): 'annotation',
            qUri('cdao:has_Value'): 'value',
            qUri('cdao:represents_TU'): 'tu',
            qUri('rdfs:label'): 'label',
            qUri('cdao:has_Support_Value'): 'confidence',
        }
        # resolved once instead of once per triple
        rdf_type = qUri('rdf:type')
        has_root = qUri('cdao:has_Root')
        node_types = (qUri('cdao:AncestralNode'), qUri('cdao:TerminalNode'))

        for s, v, o in graph:
            # process each RDF triple in the graph sequentially
            s, v, o = str(s), str(v), str(o)

            this = self.obj_info.setdefault(s, {})

            # if the predicate is one we care about, store information for
            # later
            key = assignments.get(v)
            if key is not None:
                this[key] = o

            if v == rdf_type:
                if o in node_types:
                    # this is a tree node; store it in set of all nodes
                    self.nodes.add(s)
            if v == has_root:
                # this is a tree; store its root in set of all tree roots
                self.tree_roots.add(o)

        for node in self.nodes:
            # for each node, look up all information needed to create a
            # CDAO.Clade
            node_info = self.node_info[node] = {}
            obj = self.obj_info[node]

            # A reference to a subject with no triples of its own is treated
            # as "no further information" rather than raising KeyError.
            if 'edge' in obj:
                # if this object points to an edge, we need a branch length
                # from the annotation on that edge
                edge = self.obj_info.get(obj['edge'], {})
                if 'annotation' in edge:
                    annotation = self.obj_info.get(edge['annotation'], {})
                    if 'value' in annotation:
                        node_info['branch_length'] = float(annotation['value'])

            if 'tu' in obj:
                # if this object points to a TU, we need the label of that TU
                tu = self.obj_info.get(obj['tu'], {})
                if 'label' in tu:
                    node_info['label'] = tu['label']

            if 'confidence' in obj:
                # new_clade() reads 'confidence' from node_info, so the
                # collected support value has to be copied over here
                node_info['confidence'] = float(obj['confidence'])

            if 'parent' in obj:
                # store this node as a child of its parent, if it has one,
                # so that the tree can be traversed from parent to children
                self.children.setdefault(obj['parent'], []).append(node)

    def parse_children(self, node):
        """Traverse the tree to create a nested clade structure.

        Return a CDAO.Clade, and calls itself recursively for each child,
        traversing the entire tree and creating a nested structure of CDAO.Clade
        objects.
        """
        clade = self.new_clade(node)

        clade.clades = [
            self.parse_children(child_node)
            for child_node in self.children.get(node, [])]

        return clade
250
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string).

    Serializes trees as Turtle statements: ``process_clade`` generates
    (subject, predicate, object) tuples and ``add_stmt_to_handle`` writes
    each one as a line of text.
    """

    # prefix -> namespace URI table used for the @prefix header and for
    # abbreviating URIs in the statements
    prefixes = RDF_NAMESPACES

    def __init__(self, trees):
        """Initialize parameters for writing a CDAO tree."""
        self.trees = trees

        # running counters used to build unique node/edge/TU identifiers
        self.node_counter = 0
        self.edge_counter = 0
        self.tu_counter = 0
        self.tree_counter = 0

    def write(self, handle, tree_uri='', record_complete_ancestry=False,
              rooted=False, **kwargs):
        """Write this instance's trees to a file handle.

        :param handle: object with a ``write(str)`` method.
        :param tree_uri: if non-empty, written as the ``@base`` of the
            document (a trailing '/' is appended when missing).
        :param record_complete_ancestry: when true, every node also gets a
            ``cdao:has_Ancestor`` statement for each of its ancestors.
        :param rooted: selects ``cdao:RootedTree`` vs ``cdao:UnrootedTree``.
        :param kwargs: accepted and ignored.
        :returns: number of trees written.
        """
        self.rooted = rooted
        self.record_complete_ancestry = record_complete_ancestry

        if tree_uri and not tree_uri.endswith('/'):
            tree_uri += '/'

        if tree_uri:
            # Turtle directives must be terminated by ' .'
            handle.write('@base <%s> .\n' % tree_uri)
        for prefix, namespace in self.prefixes.items():
            handle.write('@prefix %s: <%s> .\n' % (prefix, namespace))

        handle.write('<%s> a owl:Ontology .\n' % self.prefixes['cdao'])

        count = 0
        for tree in self.trees:
            self.tree_counter += 1
            # NOTE(review): this template is never interpolated and nothing in
            # this class reads it; kept unchanged for compatibility.
            self.tree_uri = 'tree%s'

            for stmt in self.process_clade(tree.clade, root=tree):
                self.add_stmt_to_handle(handle, stmt)
            count += 1

        return count

    def add_stmt_to_handle(self, handle, stmt):
        """Write one statement to the handle, abbreviating known URIs.

        :param handle: object with a ``write(str)`` method.
        :param stmt: iterable of statement parts (normally subject,
            predicate, object).  ``rdflib.URIRef`` parts are abbreviated with
            ``self.prefixes`` where possible, ``rdflib.Literal`` parts are
            written via ``n3()``, anything else via ``str()``.
        """
        stmt_strings = []
        for part in stmt:
            if isinstance(part, rdflib.URIRef):
                stmt_strings.append(self._format_uri(str(part)))
            elif isinstance(part, rdflib.Literal):
                stmt_strings.append(part.n3())
            else:
                stmt_strings.append(str(part))

        handle.write('%s .\n' % ' '.join(stmt_strings))

    def _format_uri(self, node_uri):
        """Return the textual form of a URI for a statement line."""
        # Choose the longest matching namespace so the result does not depend
        # on dictionary order when one namespace is a prefix of another.
        best_prefix = None
        best_namespace = ''
        for prefix, namespace in self.prefixes.items():
            if node_uri.startswith(namespace) and \
                    len(namespace) > len(best_namespace):
                best_prefix, best_namespace = prefix, namespace

        if best_prefix is not None:
            short = '%s:%s' % (best_prefix, node_uri[len(best_namespace):])
            return 'a' if short == 'rdf:type' else short

        if ':' in node_uri and '://' not in node_uri:
            # looks like an already-prefixed name; pass it through untouched
            return node_uri

        # relative reference, or absolute IRI outside the known namespaces
        return '<%s>' % node_uri

    def process_clade(self, clade, parent=None, root=False):
        """Recursively generate triples describing a tree of clades.

        Sets ``clade.uri`` and ``clade.ancestors`` as a side effect.

        :param clade: the clade to describe.
        :param parent: the already-processed parent clade, or None for the
            first clade of a tree.
        :param root: the tree object when *clade* is the tree's first clade,
            otherwise False.
        :returns: generator of (subject, predicate, object) tuples.
        """
        self.node_counter += 1
        clade.uri = 'node%s' % str(self.node_counter).zfill(ZEROES)
        if parent is not None:
            clade.ancestors = parent.ancestors + [parent.uri]
        else:
            clade.ancestors = []

        nUri = rdflib.URIRef

        def pUri(s):
            return rdflib.URIRef(qUri(s))

        xsd_decimal = rdflib.URIRef('http://www.w3.org/2001/XMLSchema#decimal')
        tree_id = nUri('')
        node_id = nUri(clade.uri)

        statements = []

        if root is not False:
            # create a cdao:RootedTree with reference to the tree root
            tree_type = pUri('cdao:RootedTree') if self.rooted else pUri(
                'cdao:UnrootedTree')

            statements += [
                (tree_id, pUri('rdf:type'), tree_type),
                (tree_id, pUri('cdao:has_Root'), node_id),
            ]

            for predicate, obj in getattr(root, 'attributes', []):
                statements.append((tree_id, predicate, obj))

        if clade.name:
            # create TU
            self.tu_counter += 1
            tu_uri = 'tu%s' % str(self.tu_counter).zfill(ZEROES)

            statements += [
                (nUri(tu_uri), pUri('rdf:type'), pUri('cdao:TU')),
                (node_id, pUri('cdao:represents_TU'), nUri(tu_uri)),
                (nUri(tu_uri), pUri('rdfs:label'),
                 rdflib.Literal(format_label(clade.name))),
            ]

            for predicate, obj in getattr(clade, 'tu_attributes', []):
                yield (nUri(tu_uri), predicate, obj)

        # create this node
        node_type = 'cdao:TerminalNode' if clade.is_terminal(
            ) else 'cdao:AncestralNode'
        statements += [
            (node_id, pUri('rdf:type'), pUri(node_type)),
            (node_id, pUri('cdao:belongs_to_Tree'), tree_id),
        ]

        edge_uri = None
        if parent is not None:
            # create edge from the parent node to this node
            self.edge_counter += 1
            edge_uri = 'edge%s' % str(self.edge_counter).zfill(ZEROES)

            statements += [
                (nUri(edge_uri), pUri('rdf:type'), pUri('cdao:DirectedEdge')),
                (nUri(edge_uri), pUri('cdao:belongs_to_Tree'), tree_id),
                (nUri(edge_uri), pUri('cdao:has_Parent_Node'),
                 nUri(parent.uri)),
                (nUri(edge_uri), pUri('cdao:has_Child_Node'), node_id),
                (node_id, pUri('cdao:belongs_to_Edge_as_Child'),
                 nUri(edge_uri)),
                (node_id, pUri('cdao:has_Parent'), nUri(parent.uri)),
                (nUri(parent.uri), pUri('cdao:belongs_to_Edge_as_Parent'),
                 nUri(edge_uri)),
            ]

        if hasattr(clade, 'confidence') and clade.confidence is not None:
            confidence = rdflib.Literal(clade.confidence, datatype=xsd_decimal)

            statements += [(node_id, pUri('cdao:has_Support_Value'),
                            confidence)]

        if self.record_complete_ancestry and len(clade.ancestors) > 0:
            statements += [(node_id, pUri('cdao:has_Ancestor'), nUri(ancestor))
                           for ancestor in clade.ancestors]

        if edge_uri is not None:
            # Branch length and edge attributes are attached to the incoming
            # edge, so they can only be written when such an edge exists
            # (i.e. not for the first clade of a tree).
            if clade.branch_length is not None:
                # add branch length
                edge_ann_uri = 'edge_annotation%s' % str(
                    self.edge_counter).zfill(ZEROES)

                branch_length = rdflib.Literal(clade.branch_length,
                                               datatype=xsd_decimal)
                statements += [
                    (nUri(edge_ann_uri), pUri('rdf:type'),
                     pUri('cdao:EdgeLength')),
                    (nUri(edge_uri), pUri('cdao:has_Annotation'),
                     nUri(edge_ann_uri)),
                    (nUri(edge_ann_uri), pUri('cdao:has_Value'),
                     branch_length),
                ]

            for predicate, obj in getattr(clade, 'edge_attributes', []):
                yield (nUri(edge_uri), predicate, obj)

        for stmt in statements:
            yield stmt

        for predicate, obj in getattr(clade, 'attributes', []):
            yield (node_id, predicate, obj)

        if not clade.is_terminal():
            for new_clade in clade.clades:
                for stmt in self.process_clade(new_clade, parent=clade,
                                               root=False):
                    yield stmt
457