corpus.mx raw

   1  package iskra
   2  
   3  import (
   4  	"bytes"
   5  	"fmt"
   6  	"os"
   7  	"path/filepath"
   8  
   9  	"git.smesh.lol/iskradb/lattice"
  10  )
  11  
// CorpusSegment represents one row from manifest.csv.
//
// LoadManifest fills the fields from the row's tab-separated columns in
// declaration order. ID, Kind, Name and ASTFile are set for every loaded row;
// the remaining fields stay empty when the row has fewer columns.
type CorpusSegment struct {
	ID       string // column 0
	Kind     string // column 1; kind keyword such as "func" or "type", mapped by parseKind
	Name     string // column 2; name the segment is inserted under
	ASTFile  string // column 3; file whose content is inserted as StageSRC
	ASTDump  string // column 4; file under ast/ inserted as StageAST
	IRFile   string // column 5; file under ir/ inserted as StageIR
	ASMFile  string // column 6; file under asm/ inserted as StageASM
	BINFile  string // column 7; file under bin/ inserted as StageBIN
	Lineinfo string // column 8; stored as-is, not read by the loading code visible here
}
  24  
  25  // LoadManifest reads a tab-separated manifest.csv produced by mxcorpus.
  26  // Format: id\tkind\tname\tast_file\tir_O0\tasm_O0\tbin_O0\tlineinfo
  27  func LoadManifest(path string) ([]CorpusSegment, error) {
  28  	data, err := os.ReadFile(path)
  29  	if err != nil {
  30  		return nil, err
  31  	}
  32  
  33  	lines := splitLines(data)
  34  	if len(lines) < 2 {
  35  		return nil, fmt.Errorf("manifest has no data rows")
  36  	}
  37  
  38  	segments := []CorpusSegment{:0:len(lines) - 1}
  39  	for i := 1; i < len(lines); i++ {
  40  		line := lines[i]
  41  		if len(line) == 0 {
  42  			continue
  43  		}
  44  		fields := splitTabs(line)
  45  		if len(fields) < 4 {
  46  			continue
  47  		}
  48  		seg := CorpusSegment{
  49  			ID:      string(fields[0]),
  50  			Kind:    string(fields[1]),
  51  			Name:    string(fields[2]),
  52  			ASTFile: string(fields[3]),
  53  		}
  54  		if len(fields) > 4 {
  55  			seg.ASTDump = string(fields[4])
  56  		}
  57  		if len(fields) > 5 {
  58  			seg.IRFile = string(fields[5])
  59  		}
  60  		if len(fields) > 6 {
  61  			seg.ASMFile = string(fields[6])
  62  		}
  63  		if len(fields) > 7 {
  64  			seg.BINFile = string(fields[7])
  65  		}
  66  		if len(fields) > 8 {
  67  			seg.Lineinfo = string(fields[8])
  68  		}
  69  		segments = append(segments, seg)
  70  	}
  71  	return segments, nil
  72  }
  73  
  74  // LoadCorpus reads a manifest.csv and all referenced segment files into
  75  // an in-memory iskradb tree. Call StorageFlush to persist.
  76  func LoadCorpus(corpusDir string) (*Tree, error) {
  77  	db := lattice.NewTree(256)
  78  	t := NewTree(db)
  79  	if err := LoadCorpusInto(corpusDir, t); err != nil {
  80  		return nil, err
  81  	}
  82  	FinalizeCorpus(t)
  83  	return t, nil
  84  }
  85  
// LoadCorpusStages loads only the specified stages from the corpus in
// corpusDir into the existing tree t.
//
// stages is a bitmask with one bit per stage, e.g.
// (1<<StageSRC) | (1<<StageAST); a stage is loaded only when its bit is set.
// Unlike LoadCorpus, this does not call FinalizeCorpus.
func LoadCorpusStages(corpusDir string, t *Tree, stages uint8) error {
	return loadCorpusFiltered(corpusDir, t, stages)
}
  91  
// LoadCorpusInto loads a corpus into an existing tree.
//
// Every stage is requested (stage mask 0xFF). Unlike LoadCorpus, this does
// not call FinalizeCorpus.
func LoadCorpusInto(corpusDir string, t *Tree) error {
	return loadCorpusFiltered(corpusDir, t, 0xFF)
}
  96  
  97  func loadCorpusFiltered(corpusDir string, t *Tree, stages uint8) error {
  98  	manifestPath := filepath.Join(corpusDir, "manifest.csv")
  99  	segments, err := LoadManifest(manifestPath)
 100  	if err != nil {
 101  		return err
 102  	}
 103  
 104  	// Build a lookup of segment files in .segments/ subdirs.
 105  	segmentPaths := map[string]string{}
 106  	dirEntries, _ := os.ReadDir(corpusDir)
 107  	for _, de := range dirEntries {
 108  		if de.IsDir() && bytes.HasSuffix([]byte(de.Name()), []byte(".segments")) {
 109  			subDir := filepath.Join(corpusDir, de.Name())
 110  			subEntries, _ := os.ReadDir(subDir)
 111  			for _, se := range subEntries {
 112  				segmentPaths[se.Name()] = filepath.Join(subDir, se.Name())
 113  			}
 114  		}
 115  	}
 116  
 117  	for _, seg := range segments {
 118  		kind := parseKind(seg.Kind)
 119  
 120  		if stages&(1<<StageSRC) != 0 && seg.ASTFile != "" {
 121  			astPath := segmentPaths[seg.ASTFile]
 122  			if astPath == "" {
 123  				astPath = filepath.Join(corpusDir, seg.ASTFile)
 124  			}
 125  			if astData, err := os.ReadFile(astPath); err == nil {
 126  				InsertSegment(t, StageSRC, kind, seg.Name, astData)
 127  			}
 128  		}
 129  
 130  		if stages&(1<<StageAST) != 0 && seg.ASTDump != "" {
 131  			astPath := filepath.Join(corpusDir, "ast", seg.ASTDump)
 132  			if astData, err := os.ReadFile(astPath); err == nil {
 133  				InsertSegment(t, StageAST, kind, seg.Name, astData)
 134  			}
 135  		}
 136  
 137  		if stages&(1<<StageIR) != 0 && seg.IRFile != "" {
 138  			irPath := filepath.Join(corpusDir, "ir", seg.IRFile)
 139  			if irData, err := os.ReadFile(irPath); err == nil {
 140  				InsertSegment(t, StageIR, kind, seg.Name, irData)
 141  			}
 142  		}
 143  
 144  		if stages&(1<<StageASM) != 0 && seg.ASMFile != "" {
 145  			asmPath := filepath.Join(corpusDir, "asm", seg.ASMFile)
 146  			if asmData, err := os.ReadFile(asmPath); err == nil {
 147  				InsertSegment(t, StageASM, kind, seg.Name, asmData)
 148  			}
 149  		}
 150  
 151  		if stages&(1<<StageBIN) != 0 && seg.BINFile != "" {
 152  			binPath := filepath.Join(corpusDir, "bin", seg.BINFile)
 153  			if binData, err := os.ReadFile(binPath); err == nil {
 154  				InsertSegment(t, StageBIN, kind, seg.Name, binData)
 155  			}
 156  		}
 157  	}
 158  
 159  	// Load full module IR when IR stage is included.
 160  	modPath := filepath.Join(corpusDir, "module.O0.ll")
 161  	if stages&(1<<StageIR) != 0 {
 162  		if modData, err := os.ReadFile(modPath); err == nil && len(modData) > 0 {
 163  			modName := "__module__"
 164  			if len(segments) > 0 {
 165  				modName = segments[0].Name | ".__module__"
 166  			}
 167  			InsertSegment(t, StageIR, KindPkg, modName, modData)
 168  		}
 169  	}
 170  
 171  	return nil
 172  }
 173  
// FinalizeCorpus wires cross-branch links after all insertions are complete.
// It delegates to t.FinalizeLinks. LoadCorpus calls it automatically, while
// LoadCorpusInto and LoadCorpusStages leave the call to the caller.
func FinalizeCorpus(t *Tree) {
	t.FinalizeLinks()
}
 178  
 179  // InsertSegment inserts a single segment into the tree.
 180  // Returns the record index (recIdx) for further operations.
 181  func InsertSegment(t *Tree, stage uint8, kind NodeKind, name string, content []byte) uint32 {
 182  	// Use name-based key so cross-stage lookup (StageLinksFor) can find the
 183  	// same named entity at different stages by same hash + different stage prefix.
 184  	key := NameKey(stage, name)
 185  	branch := KindToBranch(kind)
 186  	recIdx := t.Insert(branch, key, name, kind, stage)
 187  
 188  	t.SetContent(recIdx, content)
 189  
 190  	if stage == StageAST {
 191  		st := ExtractSymbols(string(content))
 192  		sig := SignatureWithTypes(st)
 193  		t.RecMeta[recIdx].SetSigHash(SignatureHash24(sig))
 194  	}
 195  
 196  	return recIdx
 197  }
 198  
 199  func parseKind(s string) NodeKind {
 200  	switch s {
 201  	case "pkg":
 202  		return KindPkg
 203  	case "import":
 204  		return KindImport
 205  	case "const":
 206  		return KindConst
 207  	case "var":
 208  		return KindVar
 209  	case "type":
 210  		return KindType
 211  	case "func":
 212  		return KindFunc
 213  	case "method":
 214  		return KindMethod
 215  	default:
 216  		return KindUnknown
 217  	}
 218  }
 219  
 220  func splitLines(data []byte) [][]byte {
 221  	var lines [][]byte
 222  	for len(data) > 0 {
 223  		idx := bytes.IndexByte(data, '\n')
 224  		if idx < 0 {
 225  			if len(data) > 0 {
 226  				lines = append(lines, data)
 227  			}
 228  			break
 229  		}
 230  		line := data[:idx]
 231  		if len(line) > 0 && line[len(line)-1] == '\r' {
 232  			line = line[:len(line)-1]
 233  		}
 234  		lines = append(lines, line)
 235  		data = data[idx+1:]
 236  	}
 237  	return lines
 238  }
 239  
 240  func splitTabs(line []byte) [][]byte {
 241  	var fields [][]byte
 242  	for len(line) > 0 {
 243  		idx := bytes.IndexByte(line, '\t')
 244  		if idx < 0 {
 245  			fields = append(fields, line)
 246  			break
 247  		}
 248  		fields = append(fields, line[:idx])
 249  		line = line[idx+1:]
 250  	}
 251  	return fields
 252  }
 253