Skip to content

compare

A collection of utilities for canonicalizing and inspecting graphs.

Among other things, they solve of the problem of deterministic bnode comparisons.

Warning: the time to canonicalize bnodes may increase exponentially on degenerate larger graphs. Use with care!

Example of comparing two graphs:

>>> g1 = Graph().parse(format='n3', data='''
...     @prefix : <http://example.org/ns#> .
...     <http://example.org> :rel
...         <http://example.org/same>,
...         [ :label "Same" ],
...         <http://example.org/a>,
...         [ :label "A" ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
...     @prefix : <http://example.org/ns#> .
...     <http://example.org> :rel
...         <http://example.org/same>,
...         [ :label "Same" ],
...         <http://example.org/b>,
...         [ :label "B" ] .
... ''')
>>>
>>> iso1 = to_isomorphic(g1)
>>> iso2 = to_isomorphic(g2)

These are not isomorphic

>>> iso1 == iso2
False

Diff the two graphs:

>>> in_both, in_first, in_second = graph_diff(iso1, iso2)

Present in both:

>>> def dump_nt_sorted(g):
...     for l in sorted(g.serialize(format='nt').splitlines()):
...         if l: print(l.decode('ascii'))
>>> dump_nt_sorted(in_both) #doctest: +SKIP
<http://example.org>
    <http://example.org/ns#rel> <http://example.org/same> .
<http://example.org>
    <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
_:cbcaabaaba17fecbc304a64f8edee4335e
    <http://example.org/ns#label> "Same" .

Only in first:

>>> dump_nt_sorted(in_first) #doctest: +SKIP
<http://example.org>
    <http://example.org/ns#rel> <http://example.org/a> .
<http://example.org>
    <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
_:cb124e4c6da0579f810c0ffe4eff485bd9
    <http://example.org/ns#label> "A" .

Only in second:

>>> dump_nt_sorted(in_second) #doctest: +SKIP
<http://example.org>
    <http://example.org/ns#rel> <http://example.org/b> .
<http://example.org>
    <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
_:cb558f30e21ddfc05ca53108348338ade8
    <http://example.org/ns#label> "B" .

Classes:

  • IsomorphicGraph

    An implementation of the RGDA1 graph digest algorithm.

Functions:

ColorItem module-attribute

ColorItem = Tuple[Union[int, str], URIRef, Union[int, str]]

ColorItemTuple module-attribute

ColorItemTuple = Tuple[ColorItem, ...]

HashCache module-attribute

HashCache = Optional[Dict[ColorItemTuple, str]]

HashFunc module-attribute

HashFunc = Callable[[str], int]

Stats module-attribute

Stats = Dict[str, Union[int, str]]

__all__ module-attribute

__all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic', 'to_canonical_graph', 'graph_diff', 'similar']

Color

Color(nodes: List[IdentifiedNode], hashfunc: HashFunc, color: ColorItemTuple = (), hash_cache: HashCache = None)

Methods:

Attributes:

Source code in rdflib/compare.py
def __init__(
    self,
    nodes: List[IdentifiedNode],
    hashfunc: HashFunc,
    color: ColorItemTuple = (),
    hash_cache: HashCache = None,
):
    if hash_cache is None:
        hash_cache = {}
    self._hash_cache = hash_cache
    self.color = color
    self.nodes = nodes
    self.hashfunc = hashfunc
    self._hash_color = None

color instance-attribute

color = color

hashfunc instance-attribute

hashfunc = hashfunc

nodes instance-attribute

nodes = nodes

__str__

__str__()
Source code in rdflib/compare.py
def __str__(self):
    nodes, color = self.key()
    return "Color %s (%s nodes)" % (color, nodes)

copy

copy()
Source code in rdflib/compare.py
def copy(self):
    return Color(
        self.nodes[:], self.hashfunc, self.color, hash_cache=self._hash_cache
    )

discrete

discrete()
Source code in rdflib/compare.py
def discrete(self):
    return len(self.nodes) == 1

distinguish

distinguish(W: Color, graph: Graph)
Source code in rdflib/compare.py
def distinguish(self, W: Color, graph: Graph):  # noqa: N803
    colors: Dict[str, Color] = {}
    for n in self.nodes:
        new_color: Tuple[ColorItem, ...] = list(self.color)  # type: ignore[assignment]
        for node in W.nodes:
            new_color += [  # type: ignore[operator]
                (1, p, W.hash_color()) for s, p, o in graph.triples((n, None, node))
            ]
            new_color += [  # type: ignore[operator]
                (W.hash_color(), p, 3) for s, p, o in graph.triples((node, None, n))
            ]
        new_color = tuple(new_color)
        new_hash_color = self.hash_color(new_color)

        if new_hash_color not in colors:
            c = Color([], self.hashfunc, new_color, hash_cache=self._hash_cache)
            colors[new_hash_color] = c
        colors[new_hash_color].nodes.append(n)
    return colors.values()

hash_color

hash_color(color: Optional[Tuple[ColorItem, ...]] = None) -> str
Source code in rdflib/compare.py
def hash_color(self, color: Optional[Tuple[ColorItem, ...]] = None) -> str:
    if color is None:
        color = self.color
    if color in self._hash_cache:
        return self._hash_cache[color]

    def stringify(x):
        if isinstance(x, Node):
            return x.n3()
        else:
            return str(x)

    if isinstance(color, Node):
        return stringify(color)
    value = 0
    for triple in color:
        value += self.hashfunc(" ".join([stringify(x) for x in triple]))
    val: str = "%x" % value
    self._hash_cache[color] = val
    return val

key

key()
Source code in rdflib/compare.py
def key(self):
    return (len(self.nodes), self.hash_color())

IsomorphicGraph

IsomorphicGraph(**kwargs)

Bases: ConjunctiveGraph

An implementation of the RGDA1 graph digest algorithm.

An implementation of RGDA1 (publication below), a combination of Sayers & Karp’s graph digest algorithm using sum and SHA-256 http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf and traces http://pallini.di.uniroma1.it, an average case polynomial time algorithm for graph canonicalization.

McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web. Rensselaer Polytechnic Institute, Troy, NY. http://gradworks.umi.com/3727015.pdf

Methods:

  • __eq__

    Graph isomorphism testing.

  • __hash__
  • __ne__

    Negative graph isomorphism testing.

  • graph_digest

    Synonym for IsomorphicGraph.internal_hash.

  • internal_hash

    This is defined instead of __hash__ to avoid a circular recursion

Source code in rdflib/compare.py
def __init__(self, **kwargs):
    super(IsomorphicGraph, self).__init__(**kwargs)

__eq__

__eq__(other)

Graph isomorphism testing.

Source code in rdflib/compare.py
def __eq__(self, other):
    """Graph isomorphism testing."""
    if not isinstance(other, IsomorphicGraph):
        return False
    elif len(self) != len(other):
        return False
    return self.internal_hash() == other.internal_hash()

__hash__

__hash__()
Source code in rdflib/compare.py
def __hash__(self):
    return super(IsomorphicGraph, self).__hash__()

__ne__

__ne__(other)

Negative graph isomorphism testing.

Source code in rdflib/compare.py
def __ne__(self, other):
    """Negative graph isomorphism testing."""
    return not self.__eq__(other)

graph_digest

graph_digest(stats=None)

Synonym for IsomorphicGraph.internal_hash.

Source code in rdflib/compare.py
def graph_digest(self, stats=None):
    """Synonym for IsomorphicGraph.internal_hash."""
    return self.internal_hash(stats=stats)

internal_hash

internal_hash(stats=None)

This is defined instead of __hash__ to avoid a circular recursion scenario with the Memory store for rdflib which requires a hash lookup in order to return a generator of triples.

Source code in rdflib/compare.py
def internal_hash(self, stats=None):
    """
    This is defined instead of `__hash__` to avoid a circular recursion
    scenario with the Memory store for rdflib which requires a hash lookup
    in order to return a generator of triples.
    """
    return _TripleCanonicalizer(self).to_hash(stats=stats)

graph_diff

graph_diff(g1: Graph, g2: Graph) -> Tuple[Graph, Graph, Graph]

Returns three sets of triples: “in both”, “in first” and “in second”.

Source code in rdflib/compare.py
def graph_diff(g1: Graph, g2: Graph) -> Tuple[Graph, Graph, Graph]:
    """Returns three sets of triples: "in both", "in first" and "in second"."""
    # bnodes have deterministic values in canonical graphs:
    cg1 = to_canonical_graph(g1)
    cg2 = to_canonical_graph(g2)
    in_both = cg1 * cg2
    in_first = cg1 - cg2
    in_second = cg2 - cg1
    return (in_both, in_first, in_second)

isomorphic

isomorphic(graph1: Graph, graph2: Graph) -> bool

Compare graph for equality.

Uses an algorithm to compute unique hashes which takes bnodes into account.

Example
>>> g1 = Graph().parse(format='n3', data='''
...     @prefix : <http://example.org/ns#> .
...     <http://example.org> :rel <http://example.org/a> .
...     <http://example.org> :rel <http://example.org/b> .
...     <http://example.org> :rel [ :label "A bnode." ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
...     @prefix ns: <http://example.org/ns#> .
...     <http://example.org> ns:rel [ ns:label "A bnode." ] .
...     <http://example.org> ns:rel <http://example.org/b>,
...             <http://example.org/a> .
... ''')
>>> isomorphic(g1, g2)
True
>>> g3 = Graph().parse(format='n3', data='''
...     @prefix : <http://example.org/ns#> .
...     <http://example.org> :rel <http://example.org/a> .
...     <http://example.org> :rel <http://example.org/b> .
...     <http://example.org> :rel <http://example.org/c> .
... ''')
>>> isomorphic(g1, g3)
False
Source code in rdflib/compare.py
def isomorphic(graph1: Graph, graph2: Graph) -> bool:
    """Compare graph for equality.

    Uses an algorithm to compute unique hashes which takes bnodes into account.

    Example:
        ```python
        >>> g1 = Graph().parse(format='n3', data='''
        ...     @prefix : <http://example.org/ns#> .
        ...     <http://example.org> :rel <http://example.org/a> .
        ...     <http://example.org> :rel <http://example.org/b> .
        ...     <http://example.org> :rel [ :label "A bnode." ] .
        ... ''')
        >>> g2 = Graph().parse(format='n3', data='''
        ...     @prefix ns: <http://example.org/ns#> .
        ...     <http://example.org> ns:rel [ ns:label "A bnode." ] .
        ...     <http://example.org> ns:rel <http://example.org/b>,
        ...             <http://example.org/a> .
        ... ''')
        >>> isomorphic(g1, g2)
        True
        >>> g3 = Graph().parse(format='n3', data='''
        ...     @prefix : <http://example.org/ns#> .
        ...     <http://example.org> :rel <http://example.org/a> .
        ...     <http://example.org> :rel <http://example.org/b> .
        ...     <http://example.org> :rel <http://example.org/c> .
        ... ''')
        >>> isomorphic(g1, g3)
        False

        ```
    """
    gd1 = _TripleCanonicalizer(graph1).to_hash()
    gd2 = _TripleCanonicalizer(graph2).to_hash()
    return gd1 == gd2

similar

similar(g1: Graph, g2: Graph)

Checks if the two graphs are “similar”.

Checks if the two graphs are “similar”, by comparing sorted triples where all bnodes have been replaced by a singular mock bnode (the _MOCK_BNODE).

This is a much cheaper, but less reliable, alternative to the comparison algorithm in isomorphic.

Source code in rdflib/compare.py
def similar(g1: Graph, g2: Graph):
    """Checks if the two graphs are "similar".

    Checks if the two graphs are "similar", by comparing sorted triples where
    all bnodes have been replaced by a singular mock bnode (the
    `_MOCK_BNODE`).

    This is a much cheaper, but less reliable, alternative to the comparison
    algorithm in `isomorphic`.
    """
    return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))

to_canonical_graph

to_canonical_graph(g1: Graph, stats: Optional[Stats] = None) -> ReadOnlyGraphAggregate

Creates a canonical, read-only graph.

Creates a canonical, read-only graph where all bnode id:s are based on deterministical SHA-256 checksums, correlated with the graph contents.

Source code in rdflib/compare.py
def to_canonical_graph(
    g1: Graph, stats: Optional[Stats] = None
) -> ReadOnlyGraphAggregate:
    """Creates a canonical, read-only graph.

    Creates a canonical, read-only graph where all bnode id:s are based on
    deterministical SHA-256 checksums, correlated with the graph contents.
    """
    graph = Graph()
    graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
    return ReadOnlyGraphAggregate([graph])

to_isomorphic

to_isomorphic(graph: Graph) -> IsomorphicGraph
Source code in rdflib/compare.py
def to_isomorphic(graph: Graph) -> IsomorphicGraph:
    if isinstance(graph, IsomorphicGraph):
        return graph
    result = IsomorphicGraph()
    if hasattr(graph, "identifier"):
        result = IsomorphicGraph(identifier=graph.identifier)
    result += graph
    return result