corpus.mx raw
1 package iskra
2
3 import (
4 "bytes"
5 "fmt"
6 "os"
7 "path/filepath"
8
9 "git.smesh.lol/iskradb/lattice"
10 )
11
// CorpusSegment is one parsed data row of manifest.csv.
//
// All fields are kept as the raw column text. The first four are required
// by LoadManifest; the remaining ones are left empty when the row has fewer
// columns.
type CorpusSegment struct {
	// Row identity: columns 0-2.
	ID, Kind, Name string

	// ASTFile (column 3) is the file loaded for StageSRC; ASTDump (column 4)
	// is the file under "ast/" loaded for StageAST.
	ASTFile, ASTDump string

	// Per-stage artifact file names (columns 5-7), resolved under the
	// "ir/", "asm/" and "bin/" subdirectories of the corpus respectively.
	IRFile, ASMFile, BINFile string

	// Lineinfo is column 8. It is parsed but not consumed by the loader
	// functions in this file.
	Lineinfo string
}
24
25 // LoadManifest reads a tab-separated manifest.csv produced by mxcorpus.
26 // Format: id\tkind\tname\tast_file\tir_O0\tasm_O0\tbin_O0\tlineinfo
27 func LoadManifest(path string) ([]CorpusSegment, error) {
28 data, err := os.ReadFile(path)
29 if err != nil {
30 return nil, err
31 }
32
33 lines := splitLines(data)
34 if len(lines) < 2 {
35 return nil, fmt.Errorf("manifest has no data rows")
36 }
37
38 segments := []CorpusSegment{:0:len(lines) - 1}
39 for i := 1; i < len(lines); i++ {
40 line := lines[i]
41 if len(line) == 0 {
42 continue
43 }
44 fields := splitTabs(line)
45 if len(fields) < 4 {
46 continue
47 }
48 seg := CorpusSegment{
49 ID: string(fields[0]),
50 Kind: string(fields[1]),
51 Name: string(fields[2]),
52 ASTFile: string(fields[3]),
53 }
54 if len(fields) > 4 {
55 seg.ASTDump = string(fields[4])
56 }
57 if len(fields) > 5 {
58 seg.IRFile = string(fields[5])
59 }
60 if len(fields) > 6 {
61 seg.ASMFile = string(fields[6])
62 }
63 if len(fields) > 7 {
64 seg.BINFile = string(fields[7])
65 }
66 if len(fields) > 8 {
67 seg.Lineinfo = string(fields[8])
68 }
69 segments = append(segments, seg)
70 }
71 return segments, nil
72 }
73
74 // LoadCorpus reads a manifest.csv and all referenced segment files into
75 // an in-memory iskradb tree. Call StorageFlush to persist.
76 func LoadCorpus(corpusDir string) (*Tree, error) {
77 db := lattice.NewTree(256)
78 t := NewTree(db)
79 if err := LoadCorpusInto(corpusDir, t); err != nil {
80 return nil, err
81 }
82 FinalizeCorpus(t)
83 return t, nil
84 }
85
86 // LoadCorpusStages loads only the specified stages from a corpus.
87 // stages is a bitmask: (1<<StageSRC) | (1<<StageAST) etc.
88 func LoadCorpusStages(corpusDir string, t *Tree, stages uint8) error {
89 return loadCorpusFiltered(corpusDir, t, stages)
90 }
91
92 // LoadCorpusInto loads a corpus into an existing tree.
93 func LoadCorpusInto(corpusDir string, t *Tree) error {
94 return loadCorpusFiltered(corpusDir, t, 0xFF)
95 }
96
97 func loadCorpusFiltered(corpusDir string, t *Tree, stages uint8) error {
98 manifestPath := filepath.Join(corpusDir, "manifest.csv")
99 segments, err := LoadManifest(manifestPath)
100 if err != nil {
101 return err
102 }
103
104 // Build a lookup of segment files in .segments/ subdirs.
105 segmentPaths := map[string]string{}
106 dirEntries, _ := os.ReadDir(corpusDir)
107 for _, de := range dirEntries {
108 if de.IsDir() && bytes.HasSuffix([]byte(de.Name()), []byte(".segments")) {
109 subDir := filepath.Join(corpusDir, de.Name())
110 subEntries, _ := os.ReadDir(subDir)
111 for _, se := range subEntries {
112 segmentPaths[se.Name()] = filepath.Join(subDir, se.Name())
113 }
114 }
115 }
116
117 for _, seg := range segments {
118 kind := parseKind(seg.Kind)
119
120 if stages&(1<<StageSRC) != 0 && seg.ASTFile != "" {
121 astPath := segmentPaths[seg.ASTFile]
122 if astPath == "" {
123 astPath = filepath.Join(corpusDir, seg.ASTFile)
124 }
125 if astData, err := os.ReadFile(astPath); err == nil {
126 InsertSegment(t, StageSRC, kind, seg.Name, astData)
127 }
128 }
129
130 if stages&(1<<StageAST) != 0 && seg.ASTDump != "" {
131 astPath := filepath.Join(corpusDir, "ast", seg.ASTDump)
132 if astData, err := os.ReadFile(astPath); err == nil {
133 InsertSegment(t, StageAST, kind, seg.Name, astData)
134 }
135 }
136
137 if stages&(1<<StageIR) != 0 && seg.IRFile != "" {
138 irPath := filepath.Join(corpusDir, "ir", seg.IRFile)
139 if irData, err := os.ReadFile(irPath); err == nil {
140 InsertSegment(t, StageIR, kind, seg.Name, irData)
141 }
142 }
143
144 if stages&(1<<StageASM) != 0 && seg.ASMFile != "" {
145 asmPath := filepath.Join(corpusDir, "asm", seg.ASMFile)
146 if asmData, err := os.ReadFile(asmPath); err == nil {
147 InsertSegment(t, StageASM, kind, seg.Name, asmData)
148 }
149 }
150
151 if stages&(1<<StageBIN) != 0 && seg.BINFile != "" {
152 binPath := filepath.Join(corpusDir, "bin", seg.BINFile)
153 if binData, err := os.ReadFile(binPath); err == nil {
154 InsertSegment(t, StageBIN, kind, seg.Name, binData)
155 }
156 }
157 }
158
159 // Load full module IR when IR stage is included.
160 modPath := filepath.Join(corpusDir, "module.O0.ll")
161 if stages&(1<<StageIR) != 0 {
162 if modData, err := os.ReadFile(modPath); err == nil && len(modData) > 0 {
163 modName := "__module__"
164 if len(segments) > 0 {
165 modName = segments[0].Name | ".__module__"
166 }
167 InsertSegment(t, StageIR, KindPkg, modName, modData)
168 }
169 }
170
171 return nil
172 }
173
174 // FinalizeCorpus wires cross-branch links after all insertions are complete.
175 func FinalizeCorpus(t *Tree) {
176 t.FinalizeLinks()
177 }
178
179 // InsertSegment inserts a single segment into the tree.
180 // Returns the record index (recIdx) for further operations.
181 func InsertSegment(t *Tree, stage uint8, kind NodeKind, name string, content []byte) uint32 {
182 // Use name-based key so cross-stage lookup (StageLinksFor) can find the
183 // same named entity at different stages by same hash + different stage prefix.
184 key := NameKey(stage, name)
185 branch := KindToBranch(kind)
186 recIdx := t.Insert(branch, key, name, kind, stage)
187
188 t.SetContent(recIdx, content)
189
190 if stage == StageAST {
191 st := ExtractSymbols(string(content))
192 sig := SignatureWithTypes(st)
193 t.RecMeta[recIdx].SetSigHash(SignatureHash24(sig))
194 }
195
196 return recIdx
197 }
198
199 func parseKind(s string) NodeKind {
200 switch s {
201 case "pkg":
202 return KindPkg
203 case "import":
204 return KindImport
205 case "const":
206 return KindConst
207 case "var":
208 return KindVar
209 case "type":
210 return KindType
211 case "func":
212 return KindFunc
213 case "method":
214 return KindMethod
215 default:
216 return KindUnknown
217 }
218 }
219
// splitLines splits data into lines on '\n'.
//
// A single trailing '\r' is removed from every line, including a final line
// that is not terminated by '\n', so CRLF input is handled uniformly. Empty
// lines in the middle of the input are kept as empty slices; a terminating
// '\n' at the end of the input does not produce an extra empty line. Empty
// input yields a nil result.
//
// The returned slices alias data; they are not copies.
func splitLines(data []byte) [][]byte {
	var lines [][]byte
	for len(data) > 0 {
		line := data
		if idx := bytes.IndexByte(data, '\n'); idx >= 0 {
			line = data[:idx]
			data = data[idx+1:]
		} else {
			// Last line has no terminator: consume the rest.
			data = data[len(data):]
		}
		if n := len(line); n > 0 && line[n-1] == '\r' {
			line = line[:n-1]
		}
		lines = append(lines, line)
	}
	return lines
}
239
// splitTabs splits line into fields on '\t'.
//
// Empty fields between two tabs are kept. An empty remainder after the last
// tab is not emitted, so "a\t" yields one field, and empty input yields a
// nil result. The returned slices alias line; they are not copies.
func splitTabs(line []byte) [][]byte {
	var fields [][]byte
	rest := line
	for len(rest) != 0 {
		cut := bytes.IndexByte(rest, '\t')
		if cut == -1 {
			// No more separators: the remainder is the last field.
			return append(fields, rest)
		}
		fields = append(fields, rest[:cut])
		rest = rest[cut+1:]
	}
	return fields
}
253