# Source code for phylozoo.core.primitives.d_multigraph.io

"""
Directed multi-graph I/O module.

This module provides format handlers for reading and writing directed multi-graphs
to/from files. Format handlers are registered with FormatRegistry for use with
the IOMixin system.

The following format handlers are defined and registered:

- **dot**: DOT format (Graphviz) (extensions: .dot, .gv)
  - Writer: `to_dot()` - Converts DirectedMultiGraph to DOT string
  - Reader: `from_dot()` - Parses DOT string to DirectedMultiGraph
- **edgelist**: Edge-list format (extensions: .el)
  - Writer: `to_edgelist()` - Converts DirectedMultiGraph to edge-list string
  - Reader: `from_edgelist()` - Parses edge-list string to DirectedMultiGraph

These handlers are automatically registered when this module is imported.
DirectedMultiGraph inherits from IOMixin, so you can use:

- `graph.save('file.dot')` - Save to file (auto-detects format)
- `graph.load('file.dot')` - Load from file (auto-detects format)
- `graph.to_string(format='dot')` - Convert to string
- `graph.from_string(string, format='edgelist')` - Parse from string
- `DirectedMultiGraph.convert('in.dot', 'out.el')` - Convert between formats

Notes
-----
DOT format supports:

- Node attributes (label, shape, color, etc.)
- Edge attributes (label, weight, color, etc.)
- Graph attributes
- Parallel edges (multigraph support)

Edge-list format:

- Simple text format: one edge per line
- Format: `u v` or `u v key` or `u v key attr1=value1 attr2=value2`
  (the key may be omitted before the attributes, e.g. `u v attr1=value1`)
- Uses node_id as the label/name
"""

from __future__ import annotations

import math
import re
from typing import Any

from phylozoo.utils.exceptions import PhyloZooParseError

from phylozoo.utils.io import FormatRegistry
from .base import DirectedMultiGraph


def _escape_dot_string(s: str) -> str:
    """
    Escape a string for use in DOT format.

    The string is returned unchanged only when it is safe as a bare DOT ID:
    either an identifier made of word characters that does not start with a
    digit, or a run of digits, and in both cases not a reserved DOT keyword.
    Everything else (including the empty string and strings containing
    characters such as ``,``, ``;``, ``=`` or ``.``) is wrapped in double
    quotes, with embedded backslashes and double quotes backslash-escaped.

    Parameters
    ----------
    s : str
        String to escape.

    Returns
    -------
    str
        Escaped string.
    """
    # DOT keywords are case-insensitive and must be quoted to be used as IDs.
    reserved = ("node", "edge", "graph", "digraph", "subgraph", "strict")

    is_bare_id = re.fullmatch(r"[^\W\d]\w*|\d+", s) is not None
    if is_bare_id and s.lower() not in reserved:
        return s

    # Escape backslashes first so the backslashes added for quotes stay intact.
    escaped = s.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _format_dot_attributes(attrs: dict[str, Any]) -> str:
    """
    Format attributes for DOT format.

    Booleans are written as ``true``/``false``, other numbers via ``str()``,
    and everything else is stringified and passed through
    ``_escape_dot_string``.

    Parameters
    ----------
    attrs : dict[str, Any]
        Dictionary of attributes.

    Returns
    -------
    str
        Formatted attribute string like '[key1=value1, key2=value2]', or an
        empty string when there are no attributes.
    """
    if not attrs:
        return ""

    parts = []
    for key, value in attrs.items():
        # bool must be tested before int: bool is a subclass of int, so the
        # numeric branch would otherwise swallow it and emit "True"/"False".
        if isinstance(value, bool):
            value_str = "true" if value else "false"
        elif isinstance(value, (int, float)):
            value_str = str(value)
        elif isinstance(value, str):
            value_str = _escape_dot_string(value)
        else:
            value_str = _escape_dot_string(str(value))

        parts.append(f"{key}={value_str}")

    return "[" + ", ".join(parts) + "]"


def to_dot(graph: DirectedMultiGraph, **kwargs: Any) -> str:
    """
    Serialise a DirectedMultiGraph as a DOT format string.

    Parameters
    ----------
    graph : DirectedMultiGraph
        The directed multi-graph to convert.
    **kwargs
        Additional arguments. Only ``graph_name`` is read: when it is a
        non-empty string it is written after the ``digraph`` keyword.

    Returns
    -------
    str
        The DOT format string representation of the graph.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph import DirectedMultiGraph
    >>> from phylozoo.core.primitives.d_multigraph.io import to_dot
    >>>
    >>> G = DirectedMultiGraph()
    >>> G.add_edge(1, 2, weight=1.0)
    0
    >>> G.add_edge(2, 3, weight=2.0)
    0
    >>> dot_str = to_dot(G)
    >>> 'digraph' in dot_str
    True
    >>> '1 -> 2' in dot_str
    True

    Notes
    -----
    The output contains, in order:

    - the ``digraph`` declaration
    - graph attributes (if any), one ``key=value;`` statement each
    - one statement per node; a ``label`` equal to ``str(node)`` is added
      when the node has no ``label`` attribute
    - one statement per edge; the edge key is written as a ``key``
      attribute only when several edges join the same ordered node pair
    """
    graph_name = kwargs.get("graph_name", "")
    header = f"digraph {_escape_dot_string(graph_name)} {{" if graph_name else "digraph {"
    out = [header]

    has_backing = hasattr(graph, "_graph")

    # Graph-level attributes.
    if has_backing and hasattr(graph._graph, "graph"):
        for name, value in (graph._graph.graph or {}).items():
            rendered = _escape_dot_string(value) if isinstance(value, str) else str(value)
            out.append(f" {name}={rendered};")

    # Node statements.
    for node in graph.nodes():
        stored = None
        if has_backing and node in graph._graph:
            stored = graph._graph.nodes[node]
        node_attrs = dict(stored) if stored else {}
        # Fall back to the node id as its label.
        node_attrs.setdefault("label", str(node))

        ident = _escape_dot_string(str(node))
        rendered = _format_dot_attributes(node_attrs)
        out.append(f" {ident} {rendered};" if rendered else f" {ident};")

    # Edge statements.
    for u, v, key, data in graph.edges_iter(keys=True, data=True):
        tail = _escape_dot_string(str(u))
        head = _escape_dot_string(str(v))

        edge_attrs = dict(data) if data else {}
        # The key only matters for telling parallel edges apart.
        if graph._graph.number_of_edges(u, v) > 1:
            edge_attrs["key"] = key

        rendered = _format_dot_attributes(edge_attrs)
        out.append(f" {tail} -> {head} {rendered};" if rendered else f" {tail} -> {head};")

    out.append("}")
    return "\n".join(out) + "\n"
def _strip_dot_comments(text: str) -> str:
    """Remove ``//``, ``/* */`` and ``#``-line comments that lie outside double-quoted strings."""
    kept: list[str] = []
    in_quotes = False
    line_start = True  # only whitespace seen so far on the current line
    i, n = 0, len(text)
    while i < n:
        ch = text[i]
        if in_quotes:
            # Copy escaped characters verbatim so an escaped quote does not end the string.
            if ch == "\\" and i + 1 < n:
                kept.append(text[i : i + 2])
                i += 2
                continue
            if ch == '"':
                in_quotes = False
            kept.append(ch)
            i += 1
            continue
        if ch == "\n":
            line_start = True
            kept.append(ch)
            i += 1
            continue
        if text.startswith("//", i) or (ch == "#" and line_start):
            # Line comment: drop everything up to (not including) the newline.
            newline = text.find("\n", i)
            i = n if newline == -1 else newline
            continue
        if text.startswith("/*", i):
            close = text.find("*/", i + 2)
            stop = n if close == -1 else close + 2
            newlines = text.count("\n", i, stop)
            if newlines:
                # Keep the line structure: statements are parsed line by line.
                kept.append("\n" * newlines)
                line_start = True
            else:
                kept.append(" ")
            i = stop
            continue
        if ch == '"':
            in_quotes = True
        if not ch.isspace():
            line_start = False
        kept.append(ch)
        i += 1
    return "".join(kept)


def _find_closing_brace(text: str, open_idx: int) -> int:
    """Return the index of the ``}`` matching the ``{`` at *open_idx*, or -1; braces in quotes are ignored."""
    depth = 0
    in_quotes = False
    i, n = open_idx, len(text)
    while i < n:
        ch = text[i]
        if in_quotes:
            if ch == "\\":
                i += 2  # skip the escaped character
                continue
            if ch == '"':
                in_quotes = False
        elif ch == '"':
            in_quotes = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def from_dot(dot_string: str, **kwargs: Any) -> DirectedMultiGraph:
    """
    Parse a DOT format string and create a DirectedMultiGraph.

    Parameters
    ----------
    dot_string : str
        DOT format string containing graph data.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    DirectedMultiGraph
        Parsed directed multi-graph.

    Raises
    ------
    PhyloZooParseError
        If no ``digraph`` declaration is found or its braces are unbalanced.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph.io import from_dot
    >>>
    >>> dot_str = '''digraph {
    ...     1 [label="Node1"];
    ...     2 [label="Node2"];
    ...     1 -> 2 [weight=1.0];
    ...     2 -> 3 [weight=2.0];
    ... }'''
    >>>
    >>> G = from_dot(dot_str)
    >>> G.number_of_nodes()
    3
    >>> G.number_of_edges()
    2

    Notes
    -----
    This is a line-oriented parser, not a full DOT grammar. It expects:

    - a ``digraph`` declaration (the graph name is optional)
    - one statement per line, each terminated by ``;``
    - node statements ``id [attrs];`` (attributes optional)
    - edge statements ``u -> v [attrs];`` (attributes optional); an integer
      ``key`` attribute is used as the edge key for parallel edges
    - graph attributes as standalone ``key=value;`` statements

    Comments (``//``, ``/* */`` and lines starting with ``#``) are removed
    before parsing; comment markers inside double-quoted strings are kept.
    Lines matching none of the statement forms are ignored.
    """
    content = _strip_dot_comments(dot_string)

    # Extract graph name and body
    digraph_match = re.search(r'\bdigraph\b\s*(\w+|"[^"]+")?\s*\{', content, re.IGNORECASE)
    if not digraph_match:
        raise PhyloZooParseError("Could not find digraph declaration in DOT string")

    # Anchor on the brace that belongs to the digraph declaration itself.
    start_idx = digraph_match.end() - 1
    end_idx = _find_closing_brace(content, start_idx)
    if end_idx < 0:
        raise PhyloZooParseError("Unmatched braces in DOT string")
    body = content[start_idx + 1 : end_idx]

    graph_attrs: dict[str, Any] = {}
    nodes_data: dict[Any, dict[str, Any]] = {}
    edges_data: list[tuple[Any, Any, int | None, dict[str, Any]]] = []

    # Node pattern: node_id [attributes];
    node_pattern = re.compile(r'(\w+|"[^"]+")\s*(?:\[([^\]]+)\])?\s*;')
    # Edge pattern: u -> v [attributes];
    edge_pattern = re.compile(r'(\w+|"[^"]+")\s*->\s*(\w+|"[^"]+")\s*(?:\[([^\]]+)\])?\s*;')
    # Graph attribute pattern: key=value; (standalone, not in brackets)
    graph_attr_pattern = re.compile(r"^(\w+)\s*=\s*([^;]+);$")

    for raw_line in body.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Graph attributes are tried first: they are the only anchored form.
        graph_attr_match = graph_attr_pattern.match(line)
        if graph_attr_match:
            name = graph_attr_match.group(1).strip()
            graph_attrs[name] = graph_attr_match.group(2).strip().strip("\"'")
            continue

        # Edges before nodes: the node pattern would also match an edge's tail.
        edge_match = edge_pattern.search(line)
        if edge_match:
            u = _convert_node_id(edge_match.group(1).strip("\"'"))
            v = _convert_node_id(edge_match.group(2).strip("\"'"))
            edge_attrs = _parse_dot_attributes(edge_match.group(3) or "")

            # "key" is never kept as an ordinary attribute; a value that is
            # not integer-like is dropped and the graph assigns the key.
            key = None
            if "key" in edge_attrs:
                raw_key = edge_attrs.pop("key")
                try:
                    key = int(raw_key)
                except (ValueError, TypeError):
                    key = None

            edges_data.append((u, v, key, edge_attrs))
            continue

        node_match = node_pattern.search(line)
        if node_match:
            node_id = _convert_node_id(node_match.group(1).strip("\"'"))
            nodes_data[node_id] = _parse_dot_attributes(node_match.group(2) or "")

    # Create graph
    graph: Any = DirectedMultiGraph(attributes=graph_attrs if graph_attrs else None)

    # Add nodes with attributes
    for node_id, attrs in nodes_data.items():
        graph.add_node(node_id, **attrs)

    # Add edges with attributes, creating endpoints that had no node statement
    for u, v, key, attrs in edges_data:
        if u not in graph:
            graph.add_node(u)
        if v not in graph:
            graph.add_node(v)
        graph.add_edge(u, v, key=key, **attrs)

    return graph  # type: ignore[no-any-return]
def _split_dot_attribute_list(attrs_str: str) -> list[str]:
    """Split an attribute list on commas that are outside quotes and not backslash-escaped."""
    parts: list[str] = []
    current: list[str] = []
    quote_char: str | None = None  # the quote character that opened the current string
    escape_next = False

    for char in attrs_str:
        if escape_next:
            current.append(char)
            escape_next = False
        elif char == "\\":
            current.append(char)
            escape_next = True
        elif quote_char is not None:
            current.append(char)
            # Only the same quote character closes the string, so an
            # apostrophe inside "..." does not end it.
            if char == quote_char:
                quote_char = None
        elif char in "\"'":
            quote_char = char
            current.append(char)
        elif char == ",":
            parts.append("".join(current).strip())
            current = []
        else:
            current.append(char)

    tail = "".join(current).strip()
    if tail:
        parts.append(tail)
    return parts


def _unquote_dot_value(raw: str) -> str:
    """Strip one pair of surrounding quotes and undo the backslash escapes written by ``_escape_dot_string``."""
    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'":
        inner = raw[1:-1]
        if raw[0] == '"':
            # Only \" and \\ are unescaped; other sequences such as \n are kept.
            inner = re.sub(r'\\(["\\])', r"\1", inner)
        return inner
    # Unbalanced quoting: be lenient and drop stray quote characters.
    return raw.strip("\"'")


def _parse_dot_attributes(attrs_str: str) -> dict[str, Any]:
    """
    Parse DOT attribute string like 'key1=value1, key2=value2'.

    Values are unquoted and then converted: ``true``/``false`` (any case)
    become booleans, values containing ``.`` are tried as ``float``, other
    values as ``int``, and anything that fails conversion stays a string.
    Values containing ``_`` are always kept as strings, because ``int()``
    and ``float()`` would otherwise read e.g. ``1_2`` as the number 12.

    Parameters
    ----------
    attrs_str : str
        Attribute string (the text between ``[`` and ``]``).

    Returns
    -------
    dict[str, Any]
        Dictionary of attributes. Items without ``=`` are ignored.
    """
    attrs: dict[str, Any] = {}
    if not attrs_str.strip():
        return attrs

    for part in _split_dot_attribute_list(attrs_str):
        if "=" not in part:
            continue
        key, raw_value = part.split("=", 1)
        key = key.strip()
        value = _unquote_dot_value(raw_value.strip())

        lowered = value.lower()
        if lowered == "true":
            attrs[key] = True
        elif lowered == "false":
            attrs[key] = False
        elif "_" in value:
            attrs[key] = value
        else:
            try:
                attrs[key] = float(value) if "." in value else int(value)
            except ValueError:
                attrs[key] = value

    return attrs


def _convert_node_id(node_str: str) -> Any:
    """
    Convert node string to appropriate type (int, float, or str).

    Conversion is attempted as ``int`` first, then ``float``. The string is
    kept as-is when it contains ``_`` (which ``int()``/``float()`` accept as
    a digit separator) or when it parses to a non-finite float (``nan``,
    ``inf``), so that names such as ``"1_2"`` or ``"nan"`` stay distinct,
    usable node IDs.

    Parameters
    ----------
    node_str : str
        Node string.

    Returns
    -------
    Any
        Converted node ID.
    """
    if "_" in node_str:
        return node_str

    try:
        return int(node_str)
    except ValueError:
        pass

    try:
        number = float(node_str)
    except ValueError:
        return node_str

    # NaN never compares equal to itself, so it cannot work as a node key.
    return number if math.isfinite(number) else node_str
def to_edgelist(graph: DirectedMultiGraph, **kwargs: Any) -> str:
    """
    Serialise a DirectedMultiGraph as an edge-list format string.

    Parameters
    ----------
    graph : DirectedMultiGraph
        The directed multi-graph to convert.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    str
        The edge-list format string representation of the graph.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph import DirectedMultiGraph
    >>> from phylozoo.core.primitives.d_multigraph.io import to_edgelist
    >>>
    >>> G = DirectedMultiGraph()
    >>> G.add_edge(1, 2, weight=1.0)
    0
    >>> G.add_edge(2, 3, weight=2.0)
    0
    >>> el_str = to_edgelist(G)
    >>> '1 2' in el_str
    True
    >>> '2 3' in el_str
    True

    Notes
    -----
    Each edge becomes one space-separated line:

    - ``str(u)`` and ``str(v)`` come first
    - the edge key follows only when several edges join the same ordered
      node pair
    - every edge attribute is appended as ``name=value``; string values
      containing a space are wrapped in double quotes
    """
    rows = []
    for u, v, key, data in graph.edges_iter(keys=True, data=True):
        fields = [str(u), str(v)]

        # The key is only needed to tell parallel edges apart.
        if graph._graph.number_of_edges(u, v) > 1:
            fields.append(str(key))

        for name, value in (data or {}).items():
            if isinstance(value, str) and " " in value:
                value = f'"{value}"'
            fields.append(f"{name}={value}")

        rows.append(" ".join(fields))

    return "\n".join(rows) + "\n"
def _split_edgelist_line(line: str) -> list[str]:
    """Split a line on whitespace, keeping double-quoted spans (quotes included) in one token."""
    tokens: list[str] = []
    current: list[str] = []
    in_quotes = False
    for char in line:
        if char == '"':
            in_quotes = not in_quotes
            current.append(char)
        elif char.isspace() and not in_quotes:
            if current:
                tokens.append("".join(current))
                current = []
        else:
            current.append(char)
    if current:
        tokens.append("".join(current))
    return tokens


def from_edgelist(edgelist_string: str, **kwargs: Any) -> DirectedMultiGraph:
    """
    Parse an edge-list format string and create a DirectedMultiGraph.

    Parameters
    ----------
    edgelist_string : str
        Edge-list format string containing graph data.
    **kwargs
        Additional arguments (currently unused, for compatibility).

    Returns
    -------
    DirectedMultiGraph
        Parsed directed multi-graph.

    Raises
    ------
    PhyloZooParseError
        If a non-empty, non-comment line has fewer than two fields.

    Examples
    --------
    >>> from phylozoo.core.primitives.d_multigraph.io import from_edgelist
    >>>
    >>> el_str = '''1 2
    ... 2 3 weight=2.0
    ... 3 4 0 key1=value1'''
    >>>
    >>> G = from_edgelist(el_str)
    >>> G.number_of_nodes()
    4
    >>> G.number_of_edges()
    3

    Notes
    -----
    This parser expects:

    - One edge per line; blank lines and lines starting with ``#`` are skipped
    - Format: `u v` or `u v key` or `u v key attr1=value1 attr2=value2`
      (the integer key may be omitted before the attributes)
    - Attribute values may be double-quoted to contain spaces, as written
      by ``to_edgelist``
    - Attribute values containing ``.`` are tried as ``float``, others as
      ``int``; values containing ``_`` or failing conversion stay strings
    - An attribute literally named ``key`` is taken as the edge key when it
      is an integer and no positional key was given; it is never stored as
      an edge attribute
    - Fields after ``u v`` that are neither an integer key nor ``name=value``
      are ignored
    """
    graph: Any = DirectedMultiGraph()

    for raw_line in edgelist_string.strip().split("\n"):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        # Quote-aware split so that `label="a b"` stays a single field.
        parts = _split_edgelist_line(line)
        if len(parts) < 2:
            raise PhyloZooParseError(f"Invalid edge line (need at least 2 values): {line}")

        # Convert node strings to appropriate types
        u = _convert_node_id(parts[0])
        v = _convert_node_id(parts[1])

        # Optional positional key: third field, integer, without "=".
        key = None
        attr_start = 2
        if len(parts) > 2 and "=" not in parts[2]:
            try:
                key = int(parts[2])
                attr_start = 3
            except ValueError:
                # Not a key; the attribute loop below skips it (no "=").
                attr_start = 2

        attrs: dict[str, Any] = {}
        for part in parts[attr_start:]:
            if "=" not in part:
                continue
            attr_key, attr_value = part.split("=", 1)
            attr_value = attr_value.strip("\"'")

            # int()/float() accept "_" as a digit separator; keep such
            # values as text instead of silently turning "1_2" into 12.
            if "_" in attr_value:
                attrs[attr_key] = attr_value
                continue
            try:
                attrs[attr_key] = float(attr_value) if "." in attr_value else int(attr_value)
            except ValueError:
                attrs[attr_key] = attr_value

        # Passing a "key" attribute through **attrs would collide with the
        # key= keyword below and raise TypeError, so handle it here.
        if "key" in attrs:
            named_key = attrs.pop("key")
            if key is None and isinstance(named_key, int):
                key = named_key

        graph.add_edge(u, v, key=key, **attrs)

    return graph  # type: ignore[no-any-return]
# Register format handlers with FormatRegistry FormatRegistry.register( DirectedMultiGraph, "dot", reader=from_dot, writer=to_dot, extensions=[".dot", ".gv"], default=True, ) FormatRegistry.register( DirectedMultiGraph, "edgelist", reader=from_edgelist, writer=to_edgelist, extensions=[".el"] )