diff --git a/environment.yml b/environment.yml
index 24baea8..e222744 100644
--- a/environment.yml
+++ b/environment.yml
@@ -46,6 +46,10 @@ dependencies:
   - pre-commit
   # For testing
   - pytest-cov
+  # For benchmarking
+  - requests
   # For debugging
   - icecream
   - ipython
+  # For type annotations
+  - mypy
diff --git a/scripts/bench.py b/scripts/bench.py
new file mode 100755
index 0000000..4221514
--- /dev/null
+++ b/scripts/bench.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+import argparse
+import json
+import os
+import statistics
+import sys
+import timeit
+
+import download_data
+import graphblas as gb
+import networkx as nx
+import numpy as np
+import scipy.sparse
+
+import graphblas_algorithms as ga
+import scipy_impl
+from graphblas_algorithms.interface import Dispatcher as ga_dispatcher
+
+thisdir = os.path.dirname(__file__)
+datapaths = [
+    os.path.join(thisdir, "..", "data"),
+    os.path.curdir,
+]
+
+
+def find_data(dataname):
+    if os.path.exists(dataname):
+        return os.path.relpath(dataname)
+    for path in datapaths:
+        path = os.path.join(path, dataname) + ".mtx"
+        if os.path.exists(path):
+            return os.path.relpath(path)
+        path = path.removesuffix(".mtx")
+        if os.path.exists(path):
+            return os.path.relpath(path)
+    if dataname in download_data.data_urls:
+        return os.path.relpath(download_data.main([dataname])[0])
+    raise FileNotFoundError(f"Unable to find data file for {dataname}")
+
+
+def get_symmetry(file_or_mminfo):
+    if not isinstance(file_or_mminfo, tuple):
+        mminfo = scipy.io.mminfo(file_or_mminfo)
+    else:
+        mminfo = file_or_mminfo
+    return mminfo[5]
+
+
+def readfile(filename, is_symmetric, backend):
+    name = filename.split(".", 1)[0].rsplit("/", 1)[0]
+    if backend == "graphblas":
+        A = gb.io.mmread(filename, name=name)
+        A.wait()
+        if is_symmetric:
+            return ga.Graph(A)
+        return ga.DiGraph(A)
+    a = scipy.io.mmread(filename)
+    if backend == "networkx":
+        create_using = nx.Graph if is_symmetric else nx.DiGraph
+        return nx.from_scipy_sparse_array(a, create_using=create_using)
+    if backend == "scipy":
+        return scipy.sparse.csr_array(a)
+    raise ValueError(
+        f"Backend {backend!r} not understood; must be 'graphblas', 'networkx', or 'scipy'"
+    )
+
+
+def best_units(num):
+    """Returns scale factor and prefix such that 1 <= num*scale < 1000"""
+    if num < 1e-12:
+        return 1e15, "f"
+    if num < 1e-9:
+        return 1e12, "p"
+    if num < 1e-6:
+        return 1e9, "n"
+    if num < 1e-3:
+        return 1e6, "\N{MICRO SIGN}"
+    if num < 1:
+        return 1e3, "m"
+    if num < 1e3:
+        return 1.0, ""
+    if num < 1e6:
+        return 1e-3, "k"
+    if num < 1e9:
+        return 1e-6, "M"
+    if num < 1e12:
+        return 1e-9, "G"
+    return 1e-12, "T"
+
+
+def stime(time):
+    scale, units = best_units(time)
+    return f"{time * scale:4.3g} {units}s"
+
+
+# Functions that aren't available in the main networkx namespace
+functionpaths = {
+    "inter_community_edges": "community.quality.inter_community_edges",
+    "intra_community_edges": "community.quality.intra_community_edges",
+    "is_tournament": "tournament.is_tournament",
+    "mutual_weight": "structuralholes.mutual_weight",
+    "score_sequence": "tournament.score_sequence",
+    "tournament_matrix": "tournament.tournament_matrix",
+}
+functioncall = {
+    "s_metric": "func(G, normalized=False)",
+}
+poweriteration = {"eigenvector_centrality", "katz_centrality", "pagerank"}
+directed_only = {
+    "in_degree_centrality",
+    "is_tournament",
+    "out_degree_centrality",
+    "score_sequence",
+    "tournament_matrix",
+    "reciprocity",
+    "overall_reciprocity",
+}
+# Is square_clustering undirected only? graphblas-algorithms doesn't implement it for directed
+undirected_only = {"generalized_degree", "k_truss", "triangles", "square_clustering"}
+
+
+def getfunction(functionname, backend):
+    if backend == "graphblas":
+        return getattr(ga_dispatcher, functionname)
+    if backend == "scipy":
+        return getattr(scipy_impl, functionname)
+    if functionname in functionpaths:
+        func = nx
+        for attr in functionpaths[functionname].split("."):
+            func = getattr(func, attr)
+        return func
+    return getattr(nx, functionname)
+
+
+def main(dataname, backend, functionname, time=3.0, n=None, extra=None, display=True):
+    filename = find_data(dataname)
+    is_symmetric = get_symmetry(filename) == "symmetric"
+    if not is_symmetric and functionname in undirected_only:
+        # Should we automatically symmetrize?
+        raise ValueError(
+            f"Data {dataname!r} is not symmetric, but {functionname} only works on undirected"
+        )
+    if is_symmetric and functionname in directed_only:
+        is_symmetric = False  # Make into directed graph
+    G = readfile(filename, is_symmetric, backend)
+    func = getfunction(functionname, backend)
+    benchstring = functioncall.get(functionname, "func(G)")
+    if extra is not None:
+        benchstring = f"{benchstring[:-1]}, {extra})"
+    globals = {"func": func, "G": G}
+    if functionname in poweriteration:
+        benchstring = f"try:\n    {benchstring}\nexcept exc:\n    pass"
+        globals["exc"] = nx.PowerIterationFailedConvergence
+    if backend == "graphblas":
+        benchstring = f"G._cache.clear()\n{benchstring}"
+    timer = timeit.Timer(benchstring, globals=globals)
+    if display:
+        line = f"Backend = {backend}, function = {functionname}, data = {dataname}"
+        if extra is not None:
+            line += f", extra = {extra}"
+        print("=" * len(line))
+        print(line)
+        print("-" * len(line))
+    info = {"backend": backend, "function": functionname, "data": dataname}
+    if extra is not None:
+        info["extra"] = extra
+    try:
+        first_time = timer.timeit(1)
+    except Exception as exc:
+        if display:
+            print(f"EXCEPTION: {exc}")
+            print("=" * len(line))
+            raise
+        info["exception"] = str(exc)
+        return info
+    if time == 0:
+        n = 1
+    elif n is None:
+        n = 2 ** max(0, int(np.ceil(np.log2(time / first_time))))
+    if display:
+        print("Number of runs:", n)
+        print("first: ", stime(first_time))
+    info["n"] = n
+    info["first"] = first_time
+    if n > 1:
+        results = timer.repeat(n - 1, 1)
+        results.append(first_time)
+        if display:
+            print("median:", stime(statistics.median(results)))
+            print("mean:  ", stime(statistics.mean(results)))
+            print("stdev: ", stime(statistics.stdev(results)))
+            print("min:   ", stime(min(results)))
+            print("max:   ", stime(max(results)))
+        info["median"] = statistics.median(results)
+        info["mean"] = statistics.mean(results)
+        info["stdev"] = statistics.stdev(results)
+        info["min"] = min(results)
+        info["max"] = max(results)
+    if display:
+        print("=" * len(line))
+    return info
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=f"Example usage: python {sys.argv[0]} -b graphblas -f pagerank -d amazon0302"
+    )
+    parser.add_argument(
+        "-b", "--backend", choices=["graphblas", "networkx", "scipy"], default="graphblas"
+    )
+    parser.add_argument(
+        "-t", "--time", type=float, default=3.0, help="Target minimum time to run benchmarks"
+    )
+    parser.add_argument(
+        "-n",
+        type=int,
+        help="The number of times to run the benchmark (the default is to run according to time)",
+    )
+    parser.add_argument(
+        "-d",
+        "--data",
+        required=True,
+        help="The path to a mtx file or one of the following data names: {"
".join(sorted(download_data.data_urls)) + + "}; data will be downloaded if necessary", + ) + parser.add_argument( + "-j", + "--json", + action="store_true", + help="Print results as json instead of human-readable text", + ) + parser.add_argument("-f", "--func", required=True, help="Which function to benchmark") + parser.add_argument("--extra", help="Extra string to add to the function call") + args = parser.parse_args() + info = main( + args.data, + args.backend, + args.func, + time=args.time, + n=args.n, + extra=args.extra, + display=not args.json, + ) + if args.json: + print(json.dumps(info)) diff --git a/scripts/bench_pagerank.py b/scripts/bench_pagerank.py deleted file mode 100644 index 512d829..0000000 --- a/scripts/bench_pagerank.py +++ /dev/null @@ -1,250 +0,0 @@ -import click -import networkx as nx - - -def best_units(num): - """Returns scale factor and prefix such that 1 <= num*scale < 1000""" - if num < 1e-12: - return 1e15, "f" - if num < 1e-9: - return 1e12, "p" - if num < 1e-6: - return 1e9, "n" - if num < 1e-3: - return 1e6, "u" - if num < 1: - return 1e3, "m" - if num < 1e3: - return 1.0, "" - if num < 1e6: - return 1e-3, "k" - if num < 1e9: - return 1e-6, "M" - if num < 1e12: - return 1e-9, "G" - return 1e-12, "T" - - -def stime(time): - scale, units = best_units(time) - return f"{time * scale:4.3g} {units}s" - - -# Copied and modified from networkx -def pagerank_scipy( - A, - alpha=0.85, - personalization=None, - max_iter=100, - tol=1.0e-6, - nstart=None, - weight="weight", - dangling=None, -): - import numpy as np - import scipy as sp - import scipy.sparse # call as sp.sparse - - N = A.shape[0] - if A.nnz == 0: - return {} - - # nodelist = list(G) - S = A.sum(axis=1) - S[S != 0] = 1.0 / S[S != 0] - # TODO: csr_array - Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) - A = Q @ A - - # initial vector - if nstart is None: - x = np.repeat(1.0 / N, N) - else: - raise NotImplementedError() - # Personalization vector - if personalization is None: - p = np.repeat(1.0 / N, N) - else: - raise NotImplementedError() - # Dangling nodes - if dangling is None: - dangling_weights = p - else: - raise NotImplementedError() - is_dangling = np.where(S == 0)[0] - - # power iteration: make up to max_iter iterations - for _ in range(max_iter): - xlast = x - x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p - # check convergence, l1 norm - err = np.absolute(x - xlast).sum() - if err < N * tol: - return x - # return dict(zip(nodelist, map(float, x))) - raise nx.PowerIterationFailedConvergence(max_iter) - - -@click.command() -@click.argument("filename") -@click.option( - "-b", - "--backend", - default="graphblas", - type=click.Choice(["graphblas", "networkx", "scipy", "gb", "nx", "sp", "gbnx"]), -) -@click.option( - "-t", - "--time", - default=3, - type=click.FloatRange(min=0, min_open=True), -) -@click.option( - "-n", - default=None, - type=click.IntRange(min=1), -) -@click.option( - "--verify", - is_flag=True, -) -@click.option( - "--alpha", - default=0.85, - type=click.FloatRange(min=0, max=1), -) -@click.option( - "--tol", - default=1e-06, - type=click.FloatRange(min=0, min_open=True), -) -def main(filename, backend, time, n, verify, alpha, tol, _get_result=False): - import statistics - import timeit - import warnings - - import numpy as np - - warnings.simplefilter("ignore") - if verify: - gb_result = main.callback(filename, "gb", None, None, False, alpha, tol, _get_result=True) - sp_result = main.callback(filename, "sp", None, None, False, alpha, tol, 
-        rtol = tol / gb_result.size
-        atol = 1e-16
-        np.testing.assert_allclose(gb_result, sp_result, rtol=rtol, atol=atol)
-        print(" |- graphblas and scipy.sparse match")
-        nx_result = main.callback(filename, "nx", None, None, False, alpha, tol, _get_result=True)
-        np.testing.assert_allclose(gb_result, nx_result, rtol=rtol, atol=atol)
-        print(" |- graphblas and networkx match")
-        np.testing.assert_allclose(sp_result, nx_result, rtol=rtol, atol=atol)
-        print(" |- scipy.sparse and networkx match")
-        gbnx_result = main.callback(
-            filename, "gbnx", None, None, False, alpha, tol, _get_result=True
-        )
-        np.testing.assert_allclose(gbnx_result, gb_result, rtol=rtol, atol=atol)
-        np.testing.assert_allclose(gbnx_result, sp_result, rtol=rtol, atol=atol)
-        np.testing.assert_allclose(gbnx_result, nx_result, rtol=rtol, atol=atol)
-        print("All good!")
-        # Show a grid of total absolute differences between results
-        results = {
-            "gb": gb_result,
-            "sp": sp_result,
-            "nx": nx_result,
-            "gbnx": gbnx_result,
-        }
-        print("     ", end="")
-        for k1 in results:
-            print("%9s" % k1, end="")
-        print()
-        for k1, v1 in results.items():
-            print("%5s" % k1, end="")
-            for v2 in results.values():
-                print("%9.2g" % np.abs(v1 - v2).sum(), end="")
-            print()
-        return
-
-    backend = {
-        "gb": "graphblas",
-        "nx": "networkx",
-        "sp": "scipy",
-    }.get(backend, backend)
-    print(f"Filename: {filename} ; backend: {backend}")
-
-    if backend == "graphblas":
-        import pandas as pd
-        from graphblas import Matrix
-
-        from graphblas_algorithms.link_analysis import pagerank_core as pagerank
-
-        start = timeit.default_timer()
-        df = pd.read_csv(filename, delimiter="\t", names=["row", "col"])
-        G = Matrix.from_coo(df["row"].values, df["col"].values, 1)
-        stop = timeit.default_timer()
-        num_nodes = G.nrows
-        num_edges = G.nvals
-        if _get_result:
-            result = pagerank(G, alpha=alpha, tol=tol)
-            result(~result.S) << 0  # Densify just in case
-            return result.to_coo()[1]
-
-    elif backend == "scipy":
-        import pandas as pd
-        import scipy.sparse
-
-        start = timeit.default_timer()
-        df = pd.read_csv(filename, delimiter="\t", names=["row", "col"])
-        G = scipy.sparse.csr_array((np.repeat(1.0, len(df)), (df["row"].values, df["col"].values)))
-        pagerank = pagerank_scipy
-        stop = timeit.default_timer()
-        num_nodes = G.shape[0]
-        num_edges = G.nnz
-        if _get_result:
-            return pagerank(G, alpha=alpha, tol=tol)
-    else:
-        if backend == "networkx":
-            from networkx import pagerank
-        else:
-            from graphblas_algorithms.link_analysis import pagerank
-
-        start = timeit.default_timer()
-        G = nx.read_edgelist(filename, delimiter="\t", nodetype=int, create_using=nx.DiGraph)
-        N = max(G)
-        for i in range(N):
-            if i not in G:
-                G.add_node(i)
-        stop = timeit.default_timer()
-        num_nodes = len(G.nodes)
-        num_edges = len(G.edges)
-
-        if _get_result:
-            result = pagerank(G, alpha=alpha, tol=tol)
-            return np.array([result.get(key, 0) for key in range(N + 1)])
-
-    print("Num nodes:", num_nodes)
-    print("Num edges:", num_edges)
-    print("Load time:", stime(stop - start))
-    timer = timeit.Timer(
-        "pagerank(G, alpha=alpha, tol=tol)",
-        globals={"pagerank": pagerank, "G": G, "alpha": alpha, "tol": tol},
-    )
-    first_time = timer.timeit(1)
-    if time == 0:
-        n = 1
-    elif n is None:
-        n = 2 ** max(0, int(np.ceil(np.log2(time / first_time))))
-    print("Number of runs:", n)
-    print("first: ", stime(first_time))
-    if n > 1:
-        results = timer.repeat(n - 1, 1)
-        results.append(first_time)
-        print("median:", stime(statistics.median(results)))
-        print("mean:  ", stime(statistics.mean(results)))
-        # print("hmean: ", stime(statistics.harmonic_mean(results)))
-        # print("gmean: ", stime(statistics.geometric_mean(results)))
-        print("stdev: ", stime(statistics.stdev(results)))
-        print("min:   ", stime(min(results)))
-        print("max:   ", stime(max(results)))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/download_data.py b/scripts/download_data.py
new file mode 100755
index 0000000..9b00ea1
--- /dev/null
+++ b/scripts/download_data.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+import argparse
+import functools
+import gzip
+import io
+import os
+import sys
+import tarfile
+
+import requests
+
+thisdir = os.path.dirname(__file__)
+datapath = functools.partial(os.path.join, thisdir, "..", "data")
+
+data_urls = {
+    "amazon0302": "https://sparse.tamu.edu/MM/SNAP/amazon0302.tar.gz",
+    "web-Google": "https://sparse.tamu.edu/MM/SNAP/web-Google.tar.gz",
+    "soc-Pokec": "https://sparse.tamu.edu/MM/SNAP/soc-Pokec.tar.gz",
+    "email-Enron": "https://sparse.tamu.edu/MM/SNAP/email-Enron.tar.gz",
+    "preferentialAttachment": "https://sparse.tamu.edu/MM/DIMACS10/preferentialAttachment.tar.gz",
+    "caidaRouterLevel": "https://sparse.tamu.edu/MM/DIMACS10/caidaRouterLevel.tar.gz",
+    "dblp-2010": "https://sparse.tamu.edu/MM/LAW/dblp-2010.tar.gz",
+    "citationCiteseer": "https://sparse.tamu.edu/MM/DIMACS10/citationCiteseer.tar.gz",
+    "coAuthorsDBLP": "https://sparse.tamu.edu/MM/DIMACS10/coAuthorsDBLP.tar.gz",
+    "as-Skitter": "https://sparse.tamu.edu/MM/SNAP/as-Skitter.tar.gz",
+    "coPapersCiteseer": "https://sparse.tamu.edu/MM/DIMACS10/coPapersCiteseer.tar.gz",
+    "coPapersDBLP": "https://sparse.tamu.edu/MM/DIMACS10/coPapersDBLP.tar.gz",
+}
+
+
+def download(url, target=None):
+    req = requests.request("GET", url)
+    assert req.ok, req.reason
+    tar = tarfile.open(fileobj=io.BytesIO(gzip.decompress(req.content)))
+    for member in tar.members:
+        dirname, basename = os.path.split(member.name)
+        if not basename.endswith(".mtx"):
+            continue
+        tar.extract(member)
+        if target:
+            os.makedirs(os.path.dirname(target), exist_ok=True)
+            os.replace(member.name, target)
+            os.removedirs(dirname)
+
+
+def main(datanames, overwrite=False):
+    filenames = []
+    for name in datanames:
+        target = datapath(f"{name}.mtx")
+        filenames.append(target)
+        relpath = os.path.relpath(target)
+        if not overwrite and os.path.exists(target):
+            print(f"{relpath} already exists; skipping", file=sys.stderr)
+            continue
+        url = data_urls[name]
+        print(f"Downloading {relpath} from {url}", file=sys.stderr)
+        download(url, target)
+    return filenames
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("datanames", nargs="*", choices=list(data_urls) + [[]])
+    args = parser.parse_args()
+    datanames = args.datanames
+    if not datanames:
+        # None specified, so download all that are missing
+        datanames = data_urls
+        overwrite = False
+    else:
+        overwrite = True
+    main(datanames, overwrite=overwrite)
diff --git a/scripts/scipy_impl.py b/scripts/scipy_impl.py
new file mode 100644
index 0000000..06c9d81
--- /dev/null
+++ b/scripts/scipy_impl.py
@@ -0,0 +1,55 @@
+import networkx as nx
+import numpy as np
+import scipy as sp
+import scipy.sparse  # call as sp.sparse
+
+
+def pagerank(
+    A,
+    alpha=0.85,
+    personalization=None,
+    max_iter=100,
+    tol=1.0e-6,
+    nstart=None,
+    weight="weight",
+    dangling=None,
+):
+
+    N = A.shape[0]
+    if A.nnz == 0:
+        return {}
+
+    # nodelist = list(G)
+    S = A.sum(axis=1)
+    S[S != 0] = 1.0 / S[S != 0]
+    # TODO: csr_array
+    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))
+    A = Q @ A
+
+    # initial vector
+    if nstart is None:
+        x = np.repeat(1.0 / N, N)
+    else:
+        raise NotImplementedError()
+    # Personalization vector
+    if personalization is None:
+        p = np.repeat(1.0 / N, N)
+    else:
+        raise NotImplementedError()
+    # Dangling nodes
+    if dangling is None:
+        dangling_weights = p
+    else:
+        raise NotImplementedError()
+    is_dangling = np.where(S == 0)[0]
+
+    # power iteration: make up to max_iter iterations
+    for _ in range(max_iter):
+        xlast = x
+        x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
+        # check convergence, l1 norm
+        err = np.absolute(x - xlast).sum()
+        if err < N * tol:
+            return x
+    # return dict(zip(nodelist, map(float, x)))
+    raise nx.PowerIterationFailedConvergence(max_iter)