dwww Home | Show directory contents | Find package

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Roget\n\nBuild a directed graph of 1022 categories and 5075 cross-references as defined\nin the 1879 version of Roget's Thesaurus. This example is described in Section\n1.2 of\n\n    Donald E. Knuth, \"The Stanford GraphBase: A Platform for Combinatorial\n    Computing\", ACM Press, New York, 1993.\n    http://www-cs-faculty.stanford.edu/~knuth/sgb.html\n\nNote that one of the 5075 cross-references is a self loop, yet it is included in\nthe graph built here because the standard networkx `DiGraph` class allows self\nloops (cf. 400pungency:400 401 403 405).\n\nThe data file can be found at:\n\n- https://github.com/networkx/networkx/blob/main/examples/graph/roget_dat.txt.gz\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import gzip\nimport re\nimport sys\n\nimport matplotlib.pyplot as plt\nimport networkx as nx\n\n\ndef roget_graph():\n    \"\"\"Return the thesaurus graph from the roget.dat example in\n    the Stanford GraphBase.\n\n    Reads ``roget_dat.txt.gz`` from the current working directory and returns\n    a ``networkx.DiGraph`` whose nodes are category numbers (kept as strings)\n    and whose edges are the cross-references between them. Self loops found\n    in the data are kept in the graph and reported on stderr.\n    \"\"\"\n    # Matches the leading category number of a head such as 400pungency;\n    # compiled once here rather than once per record.\n    numfind = re.compile(r\"^\\d+\")\n\n    G = nx.DiGraph()\n    oldline = \"\"  # text buffered from a record that continues on the next line\n\n    # The context manager closes the file even if parsing raises.\n    with gzip.open(\"roget_dat.txt.gz\", \"r\") as fh:\n        for raw in fh:\n            line = raw.decode()\n            if line.startswith(\"*\"):  # skip comments\n                continue\n            if line.startswith(\" \"):  # this is a continuation line, append\n                line = oldline + line\n            if line.endswith(\"\\\\\\n\"):  # continuation line, buffer, goto next\n                oldline = line.strip(\"\\\\\\n\")\n                continue\n\n            (headname, tails) = line.split(\":\")\n\n            head = numfind.findall(headname)[0]  # get the number\n\n            G.add_node(head)\n\n            for tail in tails.split():\n                if head == tail:\n                    # DiGraph allows self loops, so the edge is kept, not skipped.\n                    print(\"keeping self loop\", head, tail, file=sys.stderr)\n                G.add_edge(head, tail)\n\n    return G\n\n\nG = roget_graph()\nprint(\"Loaded roget_dat.txt containing 1022 categories.\")\nprint(G)\nUG = G.to_undirected()\nprint(nx.number_connected_components(UG), \"connected components\")\n\noptions = {\n    \"node_color\": \"black\",\n    \"node_size\": 1,\n    \"edge_color\": \"gray\",\n    \"linewidths\": 0,\n    \"width\": 0.1,\n}\nnx.draw_circular(UG, **options)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.7"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}

Generated by dwww version 1.16 on Wed Apr 8 10:59:17 CEST 2026.