main.go raw
1 // mxcorpus generates a paired AST/IR/ASM/BIN corpus from Moxie source packages.
2 //
3 // For each package, it produces:
4 // - Source segments (.mx) extracted from original source by declaration
5 // - LLVM IR segments (.ll) at four optimization levels (O0, O1, O2, Os)
6 // - Assembly segments (.s) at four optimization levels
7 // - Binary segments (.bin.hex) at four optimization levels
8 // - DWARF line info (.lineinfo) mapping binary offsets to source lines
9 // - A manifest.json mapping all segments with positions and metadata
10 //
11 // Usage:
12 //
13 // mxcorpus -o <outdir> [-verify] <directory...> # AST only
14 // mxcorpus -o <outdir> -ir <import-path> <directory...> # AST + IR/ASM/BIN (needs the package's directory)
15 package main
16
17 import (
18 "encoding/json"
19 "flag"
20 "fmt"
21 "go/parser"
22 "go/token"
23 "os"
24 "path/filepath"
25 "sort"
26 "strings"
27
28 "moxie/mxtext"
29 )
30
// Command-line options, populated by flag.Parse in main.
var (
	// flagVerify is set by -verify; when true, processDir calls verifyConcat
	// on each package it has written.
	flagVerify bool
	// flagIR is set by -ir and holds the import path passed to extractIR and
	// enrichManifest; the empty string skips those stages.
	flagIR string
	// flagModDir is set by -moddir (module root for non-stdlib packages).
	// NOTE(review): not read in this part of the file — presumably consumed
	// by the IR stage; confirm.
	flagModDir string
)
36
37 func main() {
38 outdir := flag.String("o", "corpus", "output directory")
39 flag.BoolVar(&flagVerify, "verify", false, "verify segment concatenation")
40 flag.StringVar(&flagIR, "ir", "", "generate IR for this import path (e.g. unicode/utf8)")
41 flag.StringVar(&flagModDir, "moddir", "", "module root directory for non-stdlib packages")
42 flag.Parse()
43
44 // AST extraction from directories.
45 var pkgDirs []string
46 for _, dir := range flag.Args() {
47 dirs, err := processDir(dir, *outdir)
48 if err != nil {
49 fmt.Fprintf(os.Stderr, "%s: %v\n", dir, err)
50 os.Exit(1)
51 }
52 pkgDirs = append(pkgDirs, dirs...)
53 }
54
55 // IR + ASM + BIN + lineinfo + pairing from import path.
56 if flagIR != "" {
57 pkgDir := ""
58 irBase := filepath.Base(flagIR)
59 for _, d := range pkgDirs {
60 if filepath.Base(d) == irBase {
61 pkgDir = d
62 break
63 }
64 }
65 if pkgDir == "" {
66 if len(pkgDirs) == 1 {
67 pkgDir = pkgDirs[0]
68 } else {
69 fmt.Fprintf(os.Stderr, "ir: cannot determine package directory for %q among %v\n", flagIR, pkgDirs)
70 os.Exit(1)
71 }
72 }
73 if err := extractIR(flagIR, pkgDir); err != nil {
74 fmt.Fprintf(os.Stderr, "ir: %v\n", err)
75 os.Exit(1)
76 }
77 if err := extractASM(pkgDir); err != nil {
78 fmt.Fprintf(os.Stderr, "asm: %v\n", err)
79 os.Exit(1)
80 }
81 if err := extractBIN(pkgDir); err != nil {
82 fmt.Fprintf(os.Stderr, "bin: %v (non-fatal)\n", err)
83 }
84 if err := extractLineInfo(pkgDir); err != nil {
85 fmt.Fprintf(os.Stderr, "lineinfo: %v (non-fatal)\n", err)
86 }
87 if err := enrichManifest(flagIR, pkgDir); err != nil {
88 fmt.Fprintf(os.Stderr, "pair: %v\n", err)
89 os.Exit(1)
90 }
91 }
92 }
93
94 func processDir(dir, outdir string) ([]string, error) {
95 entries, err := os.ReadDir(dir)
96 if err != nil {
97 return nil, err
98 }
99
100 fset := token.NewFileSet()
101 pkgFiles := map[string][]parsedFile{}
102
103 for _, e := range entries {
104 if e.IsDir() || !strings.HasSuffix(e.Name(), ".mx") {
105 continue
106 }
107 if strings.HasSuffix(e.Name(), "_test.mx") {
108 continue
109 }
110
111 path := filepath.Join(dir, e.Name())
112 src, err := os.ReadFile(path)
113 if err != nil {
114 return nil, fmt.Errorf("read %s: %w", path, err)
115 }
116
117 origSrc := append([]byte(nil), src...)
118
119 chanRes := mxtext.RewriteChanLiterals(src, fset)
120 sliceRes := mxtext.RewriteSliceLiterals(chanRes.Src, fset)
121 rewritten := sliceRes.Src
122
123 f, err := parser.ParseFile(fset, e.Name(), rewritten, parser.ParseComments)
124 if err != nil {
125 return nil, fmt.Errorf("parse %s: %w", path, err)
126 }
127
128 pf := parsedFile{
129 name: e.Name(),
130 origSrc: origSrc,
131 src: rewritten,
132 file: f,
133 }
134 pkgFiles[f.Name.Name] = append(pkgFiles[f.Name.Name], pf)
135 }
136
137 dirBase := filepath.Base(dir)
138 var result []string
139
140 for pkgName, files := range pkgFiles {
141 outName := dirBase
142 if pkgName != dirBase && pkgName != "main" {
143 outName = pkgName
144 } else if pkgName == "main" && len(pkgFiles) > 1 {
145 outName = dirBase + "_gen"
146 }
147
148 pkgDir := filepath.Join(outdir, outName)
149 manifest := &PackageManifest{
150 Package: pkgName,
151 }
152
153 for fileIdx, pf := range files {
154 fm, err := extractAST(fset, pf, fileIdx+1, pkgDir)
155 if err != nil {
156 return nil, fmt.Errorf("ast %s: %w", pf.name, err)
157 }
158 manifest.Files = append(manifest.Files, fm)
159 }
160
161 manifestPath := filepath.Join(pkgDir, "manifest.json")
162 data, err := json.MarshalIndent(manifest, "", " ")
163 if err != nil {
164 return nil, err
165 }
166 if err := os.WriteFile(manifestPath, data, 0644); err != nil {
167 return nil, err
168 }
169
170 n := 0
171 for _, f := range manifest.Files {
172 n += len(f.Segments)
173 }
174 fmt.Printf("%s: %d files, %d segments\n", outName, len(manifest.Files), n)
175
176 if flagVerify {
177 if err := verifyConcat(files, pkgDir); err != nil {
178 return nil, err
179 }
180 }
181 result = append(result, pkgDir)
182 }
183
184 return result, nil
185 }
186
187 func verifyConcat(files []parsedFile, pkgDir string) error {
188 for _, pf := range files {
189 segDir := filepath.Join(pkgDir, pf.name+".segments")
190 entries, err := os.ReadDir(segDir)
191 if err != nil {
192 return fmt.Errorf("verify %s: %w", pf.name, err)
193 }
194
195 var names []string
196 for _, e := range entries {
197 if strings.HasSuffix(e.Name(), ".mx") {
198 names = append(names, e.Name())
199 }
200 }
201 sort.Strings(names)
202
203 var concat []byte
204 for _, n := range names {
205 data, err := os.ReadFile(filepath.Join(segDir, n))
206 if err != nil {
207 return err
208 }
209 concat = append(concat, data...)
210 }
211
212 if string(concat) != string(pf.src) {
213 return fmt.Errorf("verify %s: segment concat (%d bytes) != rewritten source (%d bytes)", pf.name, len(concat), len(pf.src))
214 }
215 fmt.Printf(" verify %s: PASS (%d bytes)\n", pf.name, len(pf.src))
216 }
217 return nil
218 }
219