package iskra import ( "bytes" "fmt" "os" "path/filepath" "git.smesh.lol/iskradb/lattice" ) // CorpusSegment represents one row from manifest.csv. type CorpusSegment struct { ID string Kind string Name string ASTFile string ASTDump string IRFile string ASMFile string BINFile string Lineinfo string } // LoadManifest reads a tab-separated manifest.csv produced by mxcorpus. // Format: id\tkind\tname\tast_file\tir_O0\tasm_O0\tbin_O0\tlineinfo func LoadManifest(path string) ([]CorpusSegment, error) { data, err := os.ReadFile(path) if err != nil { return nil, err } lines := splitLines(data) if len(lines) < 2 { return nil, fmt.Errorf("manifest has no data rows") } segments := []CorpusSegment{:0:len(lines) - 1} for i := 1; i < len(lines); i++ { line := lines[i] if len(line) == 0 { continue } fields := splitTabs(line) if len(fields) < 4 { continue } seg := CorpusSegment{ ID: string(fields[0]), Kind: string(fields[1]), Name: string(fields[2]), ASTFile: string(fields[3]), } if len(fields) > 4 { seg.ASTDump = string(fields[4]) } if len(fields) > 5 { seg.IRFile = string(fields[5]) } if len(fields) > 6 { seg.ASMFile = string(fields[6]) } if len(fields) > 7 { seg.BINFile = string(fields[7]) } if len(fields) > 8 { seg.Lineinfo = string(fields[8]) } segments = append(segments, seg) } return segments, nil } // LoadCorpus reads a manifest.csv and all referenced segment files into // an in-memory iskradb tree. Call StorageFlush to persist. func LoadCorpus(corpusDir string) (*Tree, error) { db := lattice.NewTree(256) t := NewTree(db) if err := LoadCorpusInto(corpusDir, t); err != nil { return nil, err } FinalizeCorpus(t) return t, nil } // LoadCorpusStages loads only the specified stages from a corpus. // stages is a bitmask: (1< 0 { modName := "__module__" if len(segments) > 0 { modName = segments[0].Name | ".__module__" } InsertSegment(t, StageIR, KindPkg, modName, modData) } } return nil } // FinalizeCorpus wires cross-branch links after all insertions are complete. 
func FinalizeCorpus(t *Tree) {
	// Link wiring is delegated entirely to the tree.
	t.FinalizeLinks()
}

// InsertSegment adds one segment to the tree and stores its content.
// It returns the record index (recIdx) assigned by the tree so callers can
// perform further operations on the record.
//
// For segments at StageAST, a signature hash computed from the symbols
// extracted from content is additionally stored in the record's metadata.
func InsertSegment(t *Tree, stage uint8, kind NodeKind, name string, content []byte) uint32 {
	// The key is derived from the name so that cross-stage lookup
	// (StageLinksFor) can find the same named entity at different stages:
	// same hash, different stage prefix.
	nameKey := NameKey(stage, name)
	target := KindToBranch(kind)

	recIdx := t.Insert(target, nameKey, name, kind, stage)
	t.SetContent(recIdx, content)

	if stage != StageAST {
		return recIdx
	}

	// AST segments also carry a signature hash in their record metadata.
	symbols := ExtractSymbols(string(content))
	signature := SignatureWithTypes(symbols)
	t.RecMeta[recIdx].SetSigHash(SignatureHash24(signature))
	return recIdx
}

// parseKind translates a kind name ("pkg", "import", "const", "var", "type",
// "func", "method") into its NodeKind. Any other string yields KindUnknown.
func parseKind(s string) NodeKind {
	var kind NodeKind = KindUnknown
	switch s {
	case "pkg":
		kind = KindPkg
	case "import":
		kind = KindImport
	case "const":
		kind = KindConst
	case "var":
		kind = KindVar
	case "type":
		kind = KindType
	case "func":
		kind = KindFunc
	case "method":
		kind = KindMethod
	}
	return kind
}

// splitLines cuts data at every '\n' and returns the pieces as sub-slices of
// data (no copying). A '\r' immediately before a '\n' is removed from that
// line. Empty lines are kept. Bytes after the last '\n' form a final line that
// is returned as-is (a trailing '\r' on it is not stripped). Empty input
// yields a nil result.
func splitLines(data []byte) [][]byte {
	var out [][]byte
	rest := data
	for len(rest) != 0 {
		nl := bytes.IndexByte(rest, '\n')
		if nl == -1 {
			// Unterminated final line: take everything that is left.
			out = append(out, rest)
			return out
		}
		cur := rest[:nl]
		rest = rest[nl+1:]
		if n := len(cur); n > 0 && cur[n-1] == '\r' {
			cur = cur[:n-1]
		}
		out = append(out, cur)
	}
	return out
}

// splitTabs cuts line at every '\t' and returns the fields as sub-slices of
// line (no copying). Empty fields before or between tabs are kept, but nothing
// is emitted after a trailing tab, so "a\tb\t" yields two fields. Empty input
// yields a nil result.
func splitTabs(line []byte) [][]byte {
	var cols [][]byte
	begin := 0
	for i := 0; i < len(line); i++ {
		if line[i] != '\t' {
			continue
		}
		cols = append(cols, line[begin:i])
		begin = i + 1
	}
	// Only a non-empty remainder counts as a field.
	if begin < len(line) {
		cols = append(cols, line[begin:])
	}
	return cols
}