# Source code for phylozoo.core.primitives.d_multigraph.io

"""
Directed multi-graph I/O module.

This module provides format handlers for reading and writing directed multi-graphs
to/from files. Format handlers are registered with FormatRegistry for use with
the IOMixin system.

The following format handlers are defined and registered:

- **dot**: DOT format (Graphviz) (extensions: .dot, .gv)
  - Writer: `to_dot()` - Converts DirectedMultiGraph to DOT string
  - Reader: `from_dot()` - Parses DOT string to DirectedMultiGraph
- **edgelist**: Edge-list format (extensions: .el)
  - Writer: `to_edgelist()` - Converts DirectedMultiGraph to edge-list string
  - Reader: `from_edgelist()` - Parses edge-list string to DirectedMultiGraph

These handlers are automatically registered when this module is imported.
DirectedMultiGraph inherits from IOMixin, so you can use:

- `graph.save('file.dot')` - Save to file (auto-detects format)
- `graph.load('file.dot')` - Load from file (auto-detects format)
- `graph.to_string(format='dot')` - Convert to string
- `graph.from_string(string, format='edgelist')` - Parse from string
- `DirectedMultiGraph.convert('in.dot', 'out.el')` - Convert between formats

Notes
-----
DOT format supports:
- Node attributes (label, shape, color, etc.)
- Edge attributes (label, weight, color, etc.)
- Graph attributes
- Parallel edges (multigraph support)

Edge-list format:
- Simple text format: one edge per line
- Format: `u v` or `u v key` or `u v key attr1=value1 attr2=value2`
- Uses node_id as the label/name
"""

from __future__ import annotations

import re
from typing import Any

from phylozoo.utils.exceptions import PhyloZooParseError

from phylozoo.utils.io import FormatRegistry
from .base import DirectedMultiGraph


def _escape_dot_string(s: str) -> str:
    """
    Escape a string for use in DOT format.
    
    Parameters
    ----------
    s : str
        String to escape.
    
    Returns
    -------
    str
        Escaped string.
    """
    # If string contains special characters or spaces, wrap in quotes
    if any(c in s for c in [' ', '\t', '\n', '"', '\\', '[', ']', '{', '}', '-', '>']):
        # Escape backslashes and quotes
        s = s.replace('\\', '\\\\')
        s = s.replace('"', '\\"')
        return f'"{s}"'
    return s


def _format_dot_attributes(attrs: dict[str, Any]) -> str:
    """
    Format attributes for DOT format.
    
    Parameters
    ----------
    attrs : dict[str, Any]
        Dictionary of attributes.
    
    Returns
    -------
    str
        Formatted attribute string like '[key1=value1, key2=value2]'.
    """
    if not attrs:
        return ''
    
    parts = []
    for key, value in attrs.items():
        # Convert value to string and escape if needed
        if isinstance(value, str):
            value_str = _escape_dot_string(value)
        elif isinstance(value, (int, float)):
            value_str = str(value)
        elif isinstance(value, bool):
            value_str = 'true' if value else 'false'
        else:
            value_str = _escape_dot_string(str(value))
        
        parts.append(f'{key}={value_str}')
    
    return '[' + ', '.join(parts) + ']'


def to_dot(graph: DirectedMultiGraph, **kwargs: Any) -> str:
    """
    Convert a DirectedMultiGraph to a DOT format string.

    Parameters
    ----------
    graph : DirectedMultiGraph
        The directed multi-graph to convert.
    **kwargs
        Optional settings. Only ``graph_name`` is read: when it is a
        non-empty string it is written (escaped) after the ``digraph``
        keyword. Any other keyword is ignored.

    Returns
    -------
    str
        The DOT format string representation of the graph.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph import DirectedMultiGraph
    >>> from phylozoo.core.primitives.d_multigraph.io import to_dot
    >>>
    >>> G = DirectedMultiGraph()
    >>> G.add_edge(1, 2, weight=1.0)
    0
    >>> G.add_edge(2, 3, weight=2.0)
    0
    >>> dot_str = to_dot(G)
    >>> 'digraph' in dot_str
    True
    >>> '1 -> 2' in dot_str
    True

    Notes
    -----
    The DOT format includes:

    - digraph declaration
    - Graph attributes (if any), one ``key=value;`` statement per line
    - Node declarations with attributes; a ``label`` equal to ``str(node)``
      is added when the node has no ``label`` attribute
    - Edge declarations with attributes; the edge key is written as a ``key``
      attribute only when more than one edge joins the same ``(u, v)`` pair
    """
    lines = []

    # Header: named or anonymous digraph.
    graph_name = kwargs.get('graph_name', '')
    if graph_name:
        lines.append(f'digraph {_escape_dot_string(graph_name)} {{')
    else:
        lines.append('digraph {')

    # Graph attributes
    if hasattr(graph, '_graph') and hasattr(graph._graph, 'graph'):
        graph_attrs = graph._graph.graph
        if graph_attrs:
            for key, value in graph_attrs.items():
                if isinstance(value, str):
                    value_str = _escape_dot_string(value)
                else:
                    value_str = str(value)
                lines.append(f' {key}={value_str};')

    # Node declarations with attributes
    for node in graph.nodes():
        node_attrs = {}
        if hasattr(graph, '_graph') and node in graph._graph:
            node_data = graph._graph.nodes[node]
            if node_data:
                # Copy so the default label below never mutates graph data.
                node_attrs = dict(node_data)

        # Fall back to the node id as the label when none is stored.
        if 'label' not in node_attrs:
            node_attrs['label'] = str(node)

        node_id_str = _escape_dot_string(str(node))
        attrs_str = _format_dot_attributes(node_attrs)
        if attrs_str:
            lines.append(f' {node_id_str} {attrs_str};')
        else:
            lines.append(f' {node_id_str};')

    # Edge declarations with attributes
    for u, v, key, data in graph.edges_iter(keys=True, data=True):
        u_str = _escape_dot_string(str(u))
        v_str = _escape_dot_string(str(v))

        edge_attrs = dict(data) if data else {}
        # The key is only needed to tell parallel edges apart.
        if graph._graph.number_of_edges(u, v) > 1:
            edge_attrs['key'] = key

        attrs_str = _format_dot_attributes(edge_attrs)
        if attrs_str:
            lines.append(f' {u_str} -> {v_str} {attrs_str};')
        else:
            lines.append(f' {u_str} -> {v_str};')

    lines.append('}')
    return '\n'.join(lines) + '\n'
def _strip_dot_comments(text: str) -> str:
    """Remove ``//`` and ``/* */`` comments that lie outside double-quoted strings."""
    out: list[str] = []
    i = 0
    n = len(text)
    in_string = False
    while i < n:
        ch = text[i]
        if in_string:
            out.append(ch)
            if ch == '\\' and i + 1 < n and text[i + 1] != '\n':
                # Keep an escaped character (e.g. \") verbatim.
                out.append(text[i + 1])
                i += 2
                continue
            # Statements are parsed line by line, so a string never outlives
            # its line; this also limits the damage of a stray quote.
            if ch == '"' or ch == '\n':
                in_string = False
            i += 1
        elif ch == '"':
            in_string = True
            out.append(ch)
            i += 1
        elif text.startswith('//', i):
            newline = text.find('\n', i)
            if newline == -1:
                break
            i = newline  # the newline itself is kept by the next iteration
        elif text.startswith('/*', i):
            close = text.find('*/', i + 2)
            if close == -1:
                break  # unterminated block comment: drop the remainder
            # Keep a separator so surrounding statements do not get fused.
            out.append('\n' if '\n' in text[i:close] else ' ')
            i = close + 2
        else:
            out.append(ch)
            i += 1
    return ''.join(out)


def from_dot(dot_string: str, **kwargs: Any) -> DirectedMultiGraph:
    """
    Parse a DOT format string and create a DirectedMultiGraph.

    Parameters
    ----------
    dot_string : str
        DOT format string containing graph data.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    DirectedMultiGraph
        Parsed directed multi-graph.

    Raises
    ------
    PhyloZooParseError
        If no ``digraph`` declaration is found or its braces are unbalanced.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph.io import from_dot
    >>>
    >>> dot_str = '''digraph {
    ...     1 [label="Node1"];
    ...     2 [label="Node2"];
    ...     1 -> 2 [weight=1.0];
    ...     2 -> 3 [weight=2.0];
    ... }'''
    >>>
    >>> G = from_dot(dot_str)
    >>> G.number_of_nodes()
    3
    >>> G.number_of_edges()
    2

    Notes
    -----
    This is a line-oriented parser for a subset of DOT:

    - a ``digraph`` declaration with an optional name
    - one statement per line, each terminated by ``;``
    - graph attributes (``key=value;``), node declarations and edge
      declarations, the latter two with optional ``[...]`` attributes
    - parallel edges; an integer ``key`` edge attribute is used as the edge
      key, a non-integer ``key`` attribute is discarded
    - ``//`` and ``/* */`` comments outside quoted strings, and lines
      starting with ``#``, are ignored

    Lines that match none of the statement forms are skipped silently.
    """
    content = _strip_dot_comments(dot_string)

    # The name is optional and so is the whitespace before '{' ("digraph{").
    digraph_match = re.search(
        r'digraph(?:\s+(\w+|"[^"]+"))?\s*\{', content, re.IGNORECASE
    )
    if not digraph_match:
        raise PhyloZooParseError("Could not find digraph declaration in DOT string")

    # Extract graph body (between the declaration's '{' and its matching '}').
    # Anchor on the match so a '{' appearing earlier in the text is not used.
    start_idx = digraph_match.end() - 1
    end_idx = start_idx
    brace_count = 0
    for i, char in enumerate(content[start_idx:], start=start_idx):
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
            if brace_count == 0:
                end_idx = i
                break

    if brace_count != 0:
        raise PhyloZooParseError("Unmatched braces in DOT string")

    body = content[start_idx + 1:end_idx]

    # Parse graph attributes, nodes, and edges
    graph_attrs = {}
    nodes_data: dict[Any, dict[str, Any]] = {}
    edges_data: list[tuple[Any, Any, int | None, dict[str, Any]]] = []

    # Node pattern: node_id [attributes];
    node_pattern = re.compile(r'(\w+|"[^"]+")\s*(?:\[([^\]]+)\])?\s*;')
    # Edge pattern: u -> v [attributes];
    edge_pattern = re.compile(
        r'(\w+|"[^"]+")\s*->\s*(\w+|"[^"]+")\s*(?:\[([^\]]+)\])?\s*;'
    )
    # Graph attribute pattern: key=value; (standalone, not in brackets)
    graph_attr_pattern = re.compile(r'^(\w+)\s*=\s*([^;]+);$')

    for line in body.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Graph attribute first: it is the only fully anchored form.
        graph_attr_match = graph_attr_pattern.match(line)
        if graph_attr_match:
            key = graph_attr_match.group(1).strip()
            value = graph_attr_match.group(2).strip().strip('"\'')
            graph_attrs[key] = value
            continue

        # Edges before nodes: the node pattern would also match the tail of
        # an edge statement.
        edge_match = edge_pattern.search(line)
        if edge_match:
            u_str = edge_match.group(1).strip('"\'')
            v_str = edge_match.group(2).strip('"\'')
            edge_attrs = _parse_dot_attributes(edge_match.group(3) or '')

            # A 'key' attribute selects the multigraph edge key; it is always
            # removed from the attributes, and dropped if it is not an int.
            key = None
            if 'key' in edge_attrs:
                raw_key = edge_attrs.pop('key')
                try:
                    key = int(raw_key)
                except (ValueError, TypeError):
                    key = None

            edges_data.append(
                (_convert_node_id(u_str), _convert_node_id(v_str), key, edge_attrs)
            )
            continue

        node_match = node_pattern.search(line)
        if node_match:
            node_str = node_match.group(1).strip('"\'')
            node_attrs = _parse_dot_attributes(node_match.group(2) or '')
            nodes_data[_convert_node_id(node_str)] = node_attrs
            continue

    # Create graph
    graph = DirectedMultiGraph(attributes=graph_attrs if graph_attrs else None)

    # Add nodes with attributes
    for node_id, attrs in nodes_data.items():
        graph.add_node(node_id, **attrs)

    # Add edges with attributes, creating undeclared endpoints on the fly
    for u, v, key, attrs in edges_data:
        if u not in graph:
            graph.add_node(u)
        if v not in graph:
            graph.add_node(v)
        graph.add_edge(u, v, key=key, **attrs)

    return graph
def _parse_dot_attributes(attrs_str: str) -> dict[str, Any]: """ Parse DOT attribute string like 'key1=value1, key2=value2'. Parameters ---------- attrs_str : str Attribute string. Returns ------- dict[str, Any] Dictionary of attributes. """ attrs = {} if not attrs_str.strip(): return attrs # Split by comma, but respect quoted strings parts = [] current = '' in_quotes = False escape_next = False for char in attrs_str: if escape_next: current += char escape_next = False continue if char == '\\': escape_next = True current += char continue if char == '"' or char == "'": in_quotes = not in_quotes current += char continue if char == ',' and not in_quotes: parts.append(current.strip()) current = '' else: current += char if current.strip(): parts.append(current.strip()) # Parse each key=value pair for part in parts: if '=' not in part: continue key, value = part.split('=', 1) key = key.strip() value = value.strip().strip('"\'') # Try to convert to appropriate type if value.lower() == 'true': attrs[key] = True elif value.lower() == 'false': attrs[key] = False else: # Try numeric conversion try: if '.' in value: attrs[key] = float(value) else: attrs[key] = int(value) except ValueError: attrs[key] = value return attrs def _convert_node_id(node_str: str) -> Any: """ Convert node string to appropriate type (int, float, or str). Parameters ---------- node_str : str Node string. Returns ------- Any Converted node ID. """ # Try int first try: return int(node_str) except ValueError: pass # Try float try: return float(node_str) except ValueError: pass # Keep as string return node_str
def to_edgelist(graph: DirectedMultiGraph, **kwargs: Any) -> str:
    """
    Convert a DirectedMultiGraph to an edge-list format string.

    Parameters
    ----------
    graph : DirectedMultiGraph
        The directed multi-graph to convert.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    str
        The edge-list format string representation of the graph, one line
        per edge, ending with a newline.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph import DirectedMultiGraph
    >>> from phylozoo.core.primitives.d_multigraph.io import to_edgelist
    >>>
    >>> G = DirectedMultiGraph()
    >>> G.add_edge(1, 2, weight=1.0)
    0
    >>> G.add_edge(2, 3, weight=2.0)
    0
    >>> el_str = to_edgelist(G)
    >>> '1 2' in el_str
    True
    >>> '2 3' in el_str
    True

    Notes
    -----
    The edge-list format:

    - One edge per line
    - Format: `u v` or `u v key` or `u v key attr1=value1 attr2=value2`
    - Uses ``str(node)`` as the node name
    - The edge key is written only when more than one edge joins the same
      ``(u, v)`` pair
    - Edge attributes are written as ``name=value``; string values that
      contain whitespace are wrapped in double quotes
    - Only edges are written, so nodes without edges do not appear
    """
    lines = []

    for u, v, key, data in graph.edges_iter(keys=True, data=True):
        # Build line: u v [key] [attributes]
        line_parts = [str(u), str(v)]

        # The key is only needed to tell parallel edges apart.
        if graph._graph.number_of_edges(u, v) > 1:
            line_parts.append(str(key))

        if data:
            for attr_key, attr_value in data.items():
                # Fields are whitespace-separated, so any whitespace inside a
                # string value (not just a space) has to be quoted.
                if isinstance(attr_value, str) and any(
                    ch.isspace() for ch in attr_value
                ):
                    attr_value = f'"{attr_value}"'
                line_parts.append(f'{attr_key}={attr_value}')

        lines.append(' '.join(line_parts))

    return '\n'.join(lines) + '\n'
# One whitespace-delimited field of an edge-list line. A double-quoted run may
# contain whitespace (``label="a b"`` is one field); an unpaired quote is kept
# as an ordinary character.
_EDGELIST_FIELD = re.compile(r'(?:[^\s"]|"[^"]*"|")+')


def from_edgelist(edgelist_string: str, **kwargs: Any) -> DirectedMultiGraph:
    """
    Parse an edge-list format string and create a DirectedMultiGraph.

    Parameters
    ----------
    edgelist_string : str
        Edge-list format string containing graph data.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    DirectedMultiGraph
        Parsed directed multi-graph.

    Raises
    ------
    PhyloZooParseError
        If a non-empty, non-comment line has fewer than two fields.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph.io import from_edgelist
    >>>
    >>> el_str = '''1 2
    ... 2 3 weight=2.0
    ... 3 4 0 key1=value1'''
    >>>
    >>> G = from_edgelist(el_str)
    >>> G.number_of_nodes()
    4
    >>> G.number_of_edges()
    3

    Notes
    -----
    This parser expects:

    - One edge per line; blank lines and lines starting with ``#`` are skipped
    - Format: `u v` or `u v key` or `u v key attr1=value1 attr2=value2`
    - The third field is the edge key only if it has no ``=`` and is an integer
    - Attribute values may be double-quoted to include whitespace
    - Values are converted to bool (``true``/``false``, any case), float
      (when they contain ``.``) or int where possible, else kept as strings
    - Fields after the key that do not contain ``=`` are ignored
    """
    graph = DirectedMultiGraph()

    for line in edgelist_string.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Quote-aware split, so values written as name="a b" stay together.
        parts = _EDGELIST_FIELD.findall(line)
        if len(parts) < 2:
            raise PhyloZooParseError(f"Invalid edge line (need at least 2 values): {line}")

        # Convert node strings to appropriate types
        u = _convert_node_id(parts[0])
        v = _convert_node_id(parts[1])

        # Parse key and attributes
        key = None
        attrs = {}
        start_idx = 2
        if len(parts) > 2 and '=' not in parts[2]:
            try:
                key = int(parts[2])
                start_idx = 3
            except ValueError:
                # Not an integer key; the field is skipped below because it
                # has no '='.
                pass

        for part in parts[start_idx:]:
            if '=' not in part:
                continue
            attr_key, attr_value = part.split('=', 1)
            attr_value = attr_value.strip('"\'')

            # Same conversions as the DOT reader, so booleans round-trip too.
            lowered = attr_value.lower()
            if lowered == 'true':
                attrs[attr_key] = True
            elif lowered == 'false':
                attrs[attr_key] = False
            else:
                try:
                    if '.' in attr_value:
                        attrs[attr_key] = float(attr_value)
                    else:
                        attrs[attr_key] = int(attr_value)
                except ValueError:
                    attrs[attr_key] = attr_value

        # Add edge
        graph.add_edge(u, v, key=key, **attrs)

    return graph
# Register format handlers with FormatRegistry.
# This runs at import time; 'dot' is registered as the default format.
FormatRegistry.register(
    DirectedMultiGraph,
    'dot',
    reader=from_dot,
    writer=to_dot,
    extensions=['.dot', '.gv'],
    default=True,
)

FormatRegistry.register(
    DirectedMultiGraph,
    'edgelist',
    reader=from_edgelist,
    writer=to_edgelist,
    extensions=['.el'],
)