{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Roget\n\nBuild a directed graph of 1022 categories and 5075 cross-references as defined\nin the 1879 version of Roget's Thesaurus. This example is described in Section\n1.2 of\n\n Donald E. Knuth, \"The Stanford GraphBase: A Platform for Combinatorial\n Computing\", ACM Press, New York, 1993.\n http://www-cs-faculty.stanford.edu/~knuth/sgb.html\n\nNote that one of the 5075 cross references is a self loop yet it is included in\nthe graph built here because the standard networkx `DiGraph` class allows self\nloops. (cf. 400pungency:400 401 403 405).\n\nThe data file can be found at:\n\n- https://github.com/networkx/networkx/blob/main/examples/graph/roget_dat.txt.gz\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import gzip\nimport re\nimport sys\n\nimport matplotlib.pyplot as plt\nimport networkx as nx\n\n\ndef roget_graph():\n \"\"\"Return the thesaurus graph from the roget.dat example in\n the Stanford Graph Base.\n \"\"\"\n # open file roget_dat.txt.gz\n fh = gzip.open(\"roget_dat.txt.gz\", \"r\")\n\n G = nx.DiGraph()\n\n for line in fh.readlines():\n line = line.decode()\n if line.startswith(\"*\"): # skip comments\n continue\n if line.startswith(\" \"): # this is a continuation line, append\n line = oldline + line\n if line.endswith(\"\\\\\\n\"): # continuation line, buffer, goto next\n oldline = line.strip(\"\\\\\\n\")\n continue\n\n (headname, tails) = line.split(\":\")\n\n # head\n numfind = re.compile(r\"^\\d+\") # re to find the number of this word\n head = numfind.findall(headname)[0] # get the number\n\n G.add_node(head)\n\n for tail in tails.split():\n if head == tail:\n print(\"skipping self loop\", head, tail, file=sys.stderr)\n G.add_edge(head, tail)\n\n return G\n\n\nG = roget_graph()\nprint(\"Loaded roget_dat.txt containing 1022 categories.\")\nprint(G)\nUG = G.to_undirected()\nprint(nx.number_connected_components(UG), \"connected components\")\n\noptions = {\n \"node_color\": \"black\",\n \"node_size\": 1,\n \"edge_color\": \"gray\",\n \"linewidths\": 0,\n \"width\": 0.1,\n}\nnx.draw_circular(UG, **options)\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Generated by dwww version 1.16 on Wed Apr 8 10:59:17 CEST 2026.