From 28dab7dade3352b32773981c79ff195cd76d02f3 Mon Sep 17 00:00:00 2001 From: AJ ONeal Date: Tue, 10 Mar 2026 00:27:57 -0600 Subject: [PATCH] feat: complete classification of all 116 packages (169,867 rows) - Add asset_filter/asset_exclude conf keys for shared-repo packages - Split hugo/hugo-extended: exclude/require "extended" in asset name - Add macosx, ia32, .snap, .appx classifier patterns - Fix zig Platform.Size JSON string type (was int64, upstream sends string) - Filter install scripts, cosign keys, compat.json as meta-assets - Add riscv64, loong64, armv5, mipsle, mips64le to buildmeta Full classification produces 169,867 distributable rows across 116 packages. --- CAT-RULES.md | 48 ++++++++++++++++++++++++++++++--- cmd/classify/main.go | 24 +++++++++++++++-- hugo-extended/releases.conf | 1 + hugo/releases.conf | 1 + internal/buildmeta/buildmeta.go | 2 ++ internal/classify/classify.go | 10 ++++--- 6 files changed, 77 insertions(+), 9 deletions(-) diff --git a/CAT-RULES.md b/CAT-RULES.md index c02bc57..74cb955 100644 --- a/CAT-RULES.md +++ b/CAT-RULES.md @@ -79,10 +79,52 @@ Early Flutter releases (pre-2020) had no arch-specific builds — single platform SDK. No arch in filename → empty arch in CSV. This is correct; the installer would default to x86_64 on supported platforms. -### TODO for Next Batches +## Batch 3 (25 packages: arc through gitdeploy) + +### New Classifier Patterns + +- `macosx` → darwin (syncthing uses `macosx`) +- `ia32` → x86 (dart-sass uses `ia32`) +- `.snap` format → Linux-only +- `.appx` format added for PowerShell + +### New Meta-Asset Filters + +- `.pub` (cosign keys) +- `install.sh`, `install.ps1` (install scripts) +- `compat.json` (syncthing metadata) + +## Batch 4 (62 remaining packages) + Full Run + +### Hugo/Hugo-Extended Split + +hugo-extended shares the same GitHub repo as hugo. Added `asset_filter` and +`asset_exclude` conf keys to split them: +- `hugo/releases.conf`: `asset_exclude = extended` (6,354 assets) +- `hugo-extended/releases.conf`: `asset_filter = extended` (2,193 assets) + +User direction: "hugo-extended should be a separate release. I believe the +README covered this. I think it should have been the default." + +### Remaining Empty-Field Patterns (Per-Installer Territory) + +These have empty OS or arch from the generic classifier and need per-installer +config to resolve: +- Git-for-Windows: `Git-2.x.x-32-bit.tar.bz2` — no OS in filename, always Windows +- CMake: HP-UX, IRIX targets — exotic/dead platforms +- Dashcore: old naming conventions +- Old PowerShell `.msi` files — no arch in filename +- Bare binaries (ollama-darwin, caddy2_beta12_macos) — no arch info + +### Full Results + +169,867 distributable rows across 116 packages. +3 packages produce 0 rows: serviceman, aliasman (source-only), duckdns.sh. + +### TODO -- Hugo "extended" variant should be captured in `extra` column - Consider whether bare binaries (no format extension) should get a format marker -- Track `_extended` suffix detection more broadly +- Per-installer configs for packages with known-but-undetectable OS/arch +- `arm32` classification: leave to per-installer unless pattern emerges - `arm32` is vague — may mean armv6 or armv7. Leave as per-installer responsibility unless a distinct pattern emerges (user direction 2026-03-10) diff --git a/cmd/classify/main.go b/cmd/classify/main.go index 4b9853a..bdb5f76 100644 --- a/cmd/classify/main.go +++ b/cmd/classify/main.go @@ -237,6 +237,8 @@ type ghAsset struct { func classifyGitHub(pkg string, conf *installerconf.Conf, d *rawcache.Dir) ([]Dist, error) { tagPrefix := conf.Get("tag_prefix") + assetFilter := strings.ToLower(conf.Get("asset_filter")) // asset must contain this + assetExclude := strings.ToLower(conf.Get("asset_exclude")) // asset must NOT contain this releases, err := readAllReleases(d) if err != nil { return nil, err @@ -269,12 +271,21 @@ func classifyGitHub(pkg string, conf *installerconf.Conf, d *rawcache.Dir) ([]Di for _, asset := range rel.Assets { name := asset.Name + lower := strings.ToLower(name) // Skip checksums, signatures, SBOMs, etc. if isMetaAsset(name) { continue } + // Per-package asset filters. + if assetFilter != "" && !strings.Contains(lower, assetFilter) { + continue + } + if assetExclude != "" && strings.Contains(lower, assetExclude) { + continue + } + os_, arch, libc, format := classifyFilename(name) dists = append(dists, Dist{ @@ -1206,8 +1217,9 @@ func isMetaAsset(name string) bool { "checksums.txt", "sha256sums", "sha512sums", ".sbom", ".spdx", ".json.sig", ".sigstore", "_src.tar.gz", "_src.tar.xz", "_src.zip", - ".d.ts", // TypeScript definitions - ".tgz", // npm packages (not binary distributables) + ".d.ts", // TypeScript definitions + ".tgz", // npm packages (not binary distributables) + ".pub", // cosign/SSH public keys } { if strings.HasSuffix(lower, suffix) { return true @@ -1221,6 +1233,14 @@ func isMetaAsset(name string) bool { return true } } + // Exact name matches for known non-distributable files. + for _, exact := range []string{ + "install.sh", "install.ps1", "compat.json", + } { + if lower == exact { + return true + } + } return false } diff --git a/hugo-extended/releases.conf b/hugo-extended/releases.conf index 34b9804..e6d099e 100644 --- a/hugo-extended/releases.conf +++ b/hugo-extended/releases.conf @@ -1,3 +1,4 @@ source = github owner = gohugoio repo = hugo +asset_filter = extended diff --git a/hugo/releases.conf b/hugo/releases.conf index 34b9804..f56daed 100644 --- a/hugo/releases.conf +++ b/hugo/releases.conf @@ -1,3 +1,4 @@ source = github owner = gohugoio repo = hugo +asset_exclude = extended diff --git a/internal/buildmeta/buildmeta.go b/internal/buildmeta/buildmeta.go index 399d2a2..f3c3563 100644 --- a/internal/buildmeta/buildmeta.go +++ b/internal/buildmeta/buildmeta.go @@ -88,6 +88,8 @@ const ( Format7z Format = ".7z" FormatDeb Format = ".deb" FormatRPM Format = ".rpm" + FormatSnap Format = ".snap" + FormatAppx Format = ".appx" FormatSh Format = ".sh" FormatGit Format = ".git" ) diff --git a/internal/classify/classify.go b/internal/classify/classify.go index f288205..ab2254d 100644 --- a/internal/classify/classify.go +++ b/internal/classify/classify.go @@ -50,8 +50,8 @@ func Filename(name string) Result { format := detectFormat(lower) - // .deb and .rpm are Linux-only package formats. - if os == "" && (format == buildmeta.FormatDeb || format == buildmeta.FormatRPM) { + // .deb, .rpm, .snap are Linux-only package formats. + if os == "" && (format == buildmeta.FormatDeb || format == buildmeta.FormatRPM || format == buildmeta.FormatSnap) { os = buildmeta.OSLinux } // .app.zip and .dmg are macOS-only formats. @@ -78,7 +78,7 @@ var osPatterns = []struct { os buildmeta.OS pattern *regexp.Regexp }{ - {buildmeta.OSDarwin, regexp.MustCompile(`(?i)(?:` + b + `(?:darwin|macos|osx|os-x|apple)` + bEnd + `|` + b + `mac` + bEnd + `)`)}, + {buildmeta.OSDarwin, regexp.MustCompile(`(?i)(?:` + b + `(?:darwin|macos|macosx|osx|os-x|apple)` + bEnd + `|` + b + `mac` + bEnd + `)`)}, {buildmeta.OSLinux, regexp.MustCompile(`(?i)` + b + `linux` + bEnd)}, {buildmeta.OSWindows, regexp.MustCompile(`(?i)` + b + `(?:windows|win(?:32|64|dows)?)` + bEnd + `|\.exe(?:\.xz)?$|\.msi$`)}, {buildmeta.OSFreeBSD, regexp.MustCompile(`(?i)` + b + `freebsd` + bEnd)}, @@ -131,7 +131,7 @@ var archPatterns = []struct { {buildmeta.ArchMIPSLE, regexp.MustCompile(`(?i)mips(?:el|le)`)}, {buildmeta.ArchMIPS, regexp.MustCompile(`(?i)` + b + `mips` + bEnd)}, // x86 last — must not steal x86_64. - {buildmeta.ArchX86, regexp.MustCompile(`(?i)(?:` + b + `x86` + bEnd + `|i[3-6]86|` + b + `386` + bEnd + `|32-?bit)`)}, + {buildmeta.ArchX86, regexp.MustCompile(`(?i)(?:` + b + `x86` + bEnd + `|i[3-6]86|ia32|` + b + `386` + bEnd + `|32-?bit)`)}, } func detectArch(lower string) buildmeta.Arch { @@ -189,6 +189,8 @@ var formatSuffixes = []struct { {".dmg", buildmeta.FormatDMG}, {".deb", buildmeta.FormatDeb}, {".rpm", buildmeta.FormatRPM}, + {".snap", buildmeta.FormatSnap}, + {".appx", buildmeta.FormatAppx}, {".pkg", buildmeta.FormatPkg}, }