seed.ts raw

#!/usr/bin/env bun
/**
 * seed.ts — Seed a Neo4j database with Nostr events from JSONL.
 *
 * Creates the same graph structure that ORLY uses:
 *   Event nodes ──AUTHORED_BY──> NostrUser
 *   Event nodes ──REFERENCES───> Event      (e-tags)
 *   Event nodes ──MENTIONS─────> NostrUser  (p-tags)
 *   Event nodes ──TAGGED_WITH──> Tag        (t, d, r, etc.)
 *
 * Usage:
 *   bun run seed.ts                              # defaults: localhost, 500 events
 *   bun run seed.ts --uri bolt://host:7687       # custom URI
 *   bun run seed.ts --limit 2000                 # seed more events
 *   bun run seed.ts --all                        # seed the entire dataset (~11k events)
 *   bun run seed.ts --clean                      # wipe DB first
 */
  18  
  19  import neo4j from "neo4j-driver";
  20  import { parseArgs } from "util";
  21  import { resolve } from "path";
  22  import { homedir } from "os";
  23  
  24  // ── Defaults ────────────────────────────────────────────────────
  25  const DEFAULT_DATA = resolve(
  26    homedir(),
  27    "src/git.mleku.dev/mleku/nostr/encoders/event/examples/out.jsonl"
  28  );
  29  
  30  // ── Schema (matches ORLY's pkg/neo4j/schema.go) ────────────────
  31  const CONSTRAINTS = [
  32    "CREATE CONSTRAINT event_id_unique IF NOT EXISTS FOR (e:Event) REQUIRE e.id IS UNIQUE",
  33    "CREATE CONSTRAINT nostrUser_pubkey IF NOT EXISTS FOR (n:NostrUser) REQUIRE n.pubkey IS UNIQUE",
  34  ];
  35  
  36  const INDEXES = [
  37    "CREATE INDEX event_kind IF NOT EXISTS FOR (e:Event) ON (e.kind)",
  38    "CREATE INDEX event_created_at IF NOT EXISTS FOR (e:Event) ON (e.created_at)",
  39    "CREATE INDEX tag_type IF NOT EXISTS FOR (t:Tag) ON (t.type)",
  40    "CREATE INDEX tag_value IF NOT EXISTS FOR (t:Tag) ON (t.value)",
  41    "CREATE INDEX tag_type_value IF NOT EXISTS FOR (t:Tag) ON (t.type, t.value)",
  42    "CREATE INDEX event_kind_created_at IF NOT EXISTS FOR (e:Event) ON (e.kind, e.created_at)",
  43  ];
  44  
  45  // ── Parse CLI args ──────────────────────────────────────────────
  46  const { values: args } = parseArgs({
  47    options: {
  48      uri:      { type: "string", default: "bolt://localhost:7687" },
  49      user:     { type: "string", default: "neo4j" },
  50      password: { type: "string", default: "nostr-demo-2024" },
  51      data:     { type: "string", default: DEFAULT_DATA },
  52      limit:    { type: "string", default: "500" },
  53      all:      { type: "boolean", default: false },
  54      clean:    { type: "boolean", default: false },
  55      help:     { type: "boolean", default: false },
  56    },
  57  });
  58  
  59  if (args.help) {
  60    console.log(`
  61    seed.ts — Seed Neo4j with Nostr events
  62  
  63    Options:
  64      --uri <uri>          Neo4j bolt URI (default: bolt://localhost:7687)
  65      --user <user>        Neo4j username (default: neo4j)
  66      --password <pass>    Neo4j password (default: nostr-demo-2024)
  67      --data <path>        Path to JSONL event data
  68      --limit <n>          Max events to seed (default: 500)
  69      --all                Seed ALL events (ignores --limit)
  70      --clean              Wipe the database before seeding
  71      --help               Show this help
  72    `);
  73    process.exit(0);
  74  }
  75  
  76  const limit = args.all ? Infinity : parseInt(args.limit!, 10);
  77  
  78  // ── Load events from JSONL ──────────────────────────────────────
  79  interface NostrEvent {
  80    id: string;
  81    pubkey: string;
  82    kind: number;
  83    created_at: number;
  84    content: string;
  85    sig: string;
  86    tags: string[][];
  87  }
  88  
  89  async function loadEvents(path: string, max: number): Promise<NostrEvent[]> {
  90    const file = Bun.file(path);
  91    if (!(await file.exists())) {
  92      console.error(`Error: Data file not found: ${path}`);
  93      process.exit(1);
  94    }
  95  
  96    const text = await file.text();
  97    const lines = text.trim().split("\n");
  98    const events: NostrEvent[] = [];
  99    for (let i = 0; i < lines.length && i < max; i++) {
 100      events.push(JSON.parse(lines[i]));
 101    }
 102    return events;
 103  }
 104  
 105  // ── Main ────────────────────────────────────────────────────────
 106  async function main() {
 107    console.log(`Connecting to Neo4j at ${args.uri} ...`);
 108    const driver = neo4j.driver(
 109      args.uri!,
 110      neo4j.auth.basic(args.user!, args.password!)
 111    );
 112  
 113    try {
 114      await driver.verifyConnectivity();
 115    } catch (e: any) {
 116      console.error(`Cannot connect: ${e.message}`);
 117      console.error("Is Neo4j running? Try: docker ps | grep neo4j");
 118      process.exit(1);
 119    }
 120    console.log("  Connected.");
 121  
 122    const session = driver.session();
 123  
 124    try {
 125      // Clean if requested
 126      if (args.clean) {
 127        console.log("Wiping database ...");
 128        await session.run("MATCH (n) DETACH DELETE n");
 129        console.log("  Done.");
 130      }
 131  
 132      // Apply schema
 133      console.log("Applying schema ...");
 134      for (const cypher of [...CONSTRAINTS, ...INDEXES]) {
 135        await session.run(cypher);
 136      }
 137      console.log("  Schema applied (constraints + indexes).");
 138  
 139      // Load events
 140      console.log(`Loading events from ${args.data} ...`);
 141      const events = await loadEvents(args.data!, limit);
 142      console.log(`  Loaded ${events.length} events.`);
 143  
 144      // Seed in batches
 145      console.log("Seeding graph ...");
 146      const t0 = Date.now();
 147      const batchSize = 50;
 148  
 149      for (let i = 0; i < events.length; i += batchSize) {
 150        const batch = events.slice(i, i + batchSize);
 151        await session.executeWrite(async (tx) => {
 152          // Create Event + NostrUser nodes with AUTHORED_BY
 153          const batchData = batch.map((ev) => ({
 154            id: ev.id,
 155            kind: neo4j.int(ev.kind),
 156            created_at: neo4j.int(ev.created_at),
 157            content: ev.content || "",
 158            sig: ev.sig || "",
 159            pubkey: ev.pubkey,
 160            tags: JSON.stringify(ev.tags || []),
 161          }));
 162  
 163          await tx.run(
 164            `UNWIND $events AS ev
 165             MERGE (e:Event {id: ev.id})
 166             SET e.kind = ev.kind,
 167                 e.created_at = ev.created_at,
 168                 e.content = ev.content,
 169                 e.sig = ev.sig,
 170                 e.pubkey = ev.pubkey,
 171                 e.tags = ev.tags
 172             MERGE (a:NostrUser {pubkey: ev.pubkey})
 173             MERGE (e)-[:AUTHORED_BY]->(a)`,
 174            { events: batchData }
 175          );
 176  
 177          // Create tag relationships for each event
 178          for (const ev of batch) {
 179            for (const tag of ev.tags || []) {
 180              if (tag.length < 2 || !tag[1]) continue;
 181              const [tagType, tagValue] = tag;
 182  
 183              if (tagType === "e") {
 184                await tx.run(
 185                  `MATCH (src:Event {id: $srcId})
 186                   MERGE (tgt:Event {id: $refId})
 187                   MERGE (src)-[:REFERENCES]->(tgt)`,
 188                  { srcId: ev.id, refId: tagValue }
 189                );
 190              } else if (tagType === "p") {
 191                await tx.run(
 192                  `MATCH (src:Event {id: $srcId})
 193                   MERGE (mentioned:NostrUser {pubkey: $pubkey})
 194                   MERGE (src)-[:MENTIONS]->(mentioned)`,
 195                  { srcId: ev.id, pubkey: tagValue }
 196                );
 197              } else {
 198                await tx.run(
 199                  `MATCH (src:Event {id: $srcId})
 200                   MERGE (t:Tag {type: $type, value: $value})
 201                   MERGE (src)-[:TAGGED_WITH]->(t)`,
 202                  { srcId: ev.id, type: tagType, value: tagValue }
 203                );
 204              }
 205            }
 206          }
 207        });
 208  
 209        const done = Math.min(i + batchSize, events.length);
 210        const pct = Math.floor((done * 100) / events.length);
 211        process.stdout.write(`\r  Progress: ${done}/${events.length} (${pct}%)`);
 212      }
 213  
 214      const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
 215      console.log(`\n  Seeded ${events.length} events in ${elapsed}s`);
 216  
 217      // Print summary
 218      const stats = await session.run(`
 219        MATCH (e:Event) WITH count(e) AS events
 220        MATCH (a:NostrUser) WITH events, count(a) AS users
 221        MATCH (t:Tag) WITH events, users, count(t) AS tags
 222        RETURN events, users, tags
 223      `);
 224      const row = stats.records[0];
 225      console.log(`\n  Graph summary:`);
 226      console.log(`    Events:    ${row.get("events")}`);
 227      console.log(`    Users:     ${row.get("users")}`);
 228      console.log(`    Tags:      ${row.get("tags")}`);
 229  
 230      const rels = await session.run(`
 231        MATCH ()-[r:AUTHORED_BY]->() WITH count(r) AS authored
 232        MATCH ()-[r:REFERENCES]->() WITH authored, count(r) AS refs
 233        MATCH ()-[r:MENTIONS]->() WITH authored, refs, count(r) AS mentions
 234        MATCH ()-[r:TAGGED_WITH]->() WITH authored, refs, mentions, count(r) AS tagged
 235        RETURN authored, refs, mentions, tagged
 236      `);
 237      const rrow = rels.records[0];
 238      console.log(`    AUTHORED_BY: ${rrow.get("authored")}`);
 239      console.log(`    REFERENCES:  ${rrow.get("refs")}`);
 240      console.log(`    MENTIONS:    ${rrow.get("mentions")}`);
 241      console.log(`    TAGGED_WITH: ${rrow.get("tagged")}`);
 242  
 243      console.log("\nDone! The database is ready for Cypher queries.");
 244    } finally {
 245      await session.close();
 246      await driver.close();
 247    }
 248  }
 249  
 250  main();
 251