Files
vim-ale/internal/classify/classify.go
AJ ONeal 28dab7dade feat: complete classification of all 116 packages (169,867 rows)
- Add asset_filter/asset_exclude conf keys for shared-repo packages
- Split hugo/hugo-extended: exclude/require "extended" in asset name
- Add macosx, ia32, .snap, .appx classifier patterns
- Fix zig Platform.Size JSON string type (was int64, upstream sends string)
- Filter install scripts, cosign keys, compat.json as meta-assets
- Add riscv64, loong64, armv5, mipsle, mips64le to buildmeta

Full classification produces 169,867 distributable rows across 116 packages.
2026-03-10 00:27:57 -06:00

207 lines
7.2 KiB
Go

// Package classify extracts build targets from release asset filenames.
//
// Standard toolchains (goreleaser, cargo-dist, zig build) produce predictable
// filenames like "tool_0.1.0_linux_amd64.tar.gz" or
// "tool-0.1.0-x86_64-unknown-linux-musl.tar.gz". This package matches those
// patterns directly using regex, avoiding heuristic guessing.
//
// Detection order matters: architectures are checked longest-first to prevent
// "x86" from matching inside "x86_64", and OS checks use word boundaries.
package classify
import (
"path"
"regexp"
"strings"
"github.com/webinstall/webi-installers/internal/buildmeta"
)
// Result holds the classification of an asset filename.
type Result struct {
OS buildmeta.OS
Arch buildmeta.Arch
Libc buildmeta.Libc
Format buildmeta.Format
}
// Target returns the build target (OS + Arch + Libc).
func (r Result) Target() buildmeta.Target {
return buildmeta.Target{OS: r.OS, Arch: r.Arch, Libc: r.Libc}
}
// Filename classifies a release asset filename, returning the detected
// OS, architecture, libc, and archive format. Undetected fields are empty.
//
// OS is detected first because it can influence arch interpretation.
// For example, "windows-arm" in modern releases means ARM64, while
// bare "arm" on Linux historically means ARMv6.
func Filename(name string) Result {
lower := strings.ToLower(name)
os := detectOS(lower)
arch := detectArch(lower)
// On Windows, bare "arm" (detected as ARMv6) almost certainly means
// ARM64. Windows never shipped ARMv6 binaries — "ARM" became the
// marketing label for ARM64 (Windows on ARM).
if os == buildmeta.OSWindows && arch == buildmeta.ArchARMv6 {
arch = buildmeta.ArchARM64
}
format := detectFormat(lower)
// .deb, .rpm, .snap are Linux-only package formats.
if os == "" && (format == buildmeta.FormatDeb || format == buildmeta.FormatRPM || format == buildmeta.FormatSnap) {
os = buildmeta.OSLinux
}
// .app.zip and .dmg are macOS-only formats.
if os == "" && (format == buildmeta.FormatAppZip || format == buildmeta.FormatDMG) {
os = buildmeta.OSDarwin
}
return Result{
OS: os,
Arch: arch,
Libc: detectLibc(lower),
Format: format,
}
}
// b is a boundary: start/end of string or a non-alphanumeric separator.
// Go's RE2 doesn't support \b, so we use this instead.
const b = `(?:^|[^a-zA-Z0-9])`
const bEnd = `(?:[^a-zA-Z0-9]|$)`
// --- OS detection ---
var osPatterns = []struct {
os buildmeta.OS
pattern *regexp.Regexp
}{
{buildmeta.OSDarwin, regexp.MustCompile(`(?i)(?:` + b + `(?:darwin|macos|macosx|osx|os-x|apple)` + bEnd + `|` + b + `mac` + bEnd + `)`)},
{buildmeta.OSLinux, regexp.MustCompile(`(?i)` + b + `linux` + bEnd)},
{buildmeta.OSWindows, regexp.MustCompile(`(?i)` + b + `(?:windows|win(?:32|64|dows)?)` + bEnd + `|\.exe(?:\.xz)?$|\.msi$`)},
{buildmeta.OSFreeBSD, regexp.MustCompile(`(?i)` + b + `freebsd` + bEnd)},
{buildmeta.OSOpenBSD, regexp.MustCompile(`(?i)` + b + `openbsd` + bEnd)},
{buildmeta.OSNetBSD, regexp.MustCompile(`(?i)` + b + `netbsd` + bEnd)},
{buildmeta.OSDragonFly, regexp.MustCompile(`(?i)` + b + `dragonfly(?:bsd)?` + bEnd)},
{buildmeta.OSSunOS, regexp.MustCompile(`(?i)` + b + `(?:sunos|solaris|illumos)` + bEnd)},
{buildmeta.OSAIX, regexp.MustCompile(`(?i)` + b + `aix` + bEnd)},
{buildmeta.OSAndroid, regexp.MustCompile(`(?i)` + b + `android` + bEnd)},
{buildmeta.OSPlan9, regexp.MustCompile(`(?i)` + b + `plan9` + bEnd)},
}
func detectOS(lower string) buildmeta.OS {
for _, p := range osPatterns {
if p.pattern.MatchString(lower) {
return p.os
}
}
return ""
}
// --- Arch detection ---
// Order matters: check longer/more-specific patterns first.
var archPatterns = []struct {
arch buildmeta.Arch
pattern *regexp.Regexp
}{
// Universal/fat binaries before specific arches.
{buildmeta.ArchUniversal2, regexp.MustCompile(`(?i)` + b + `(?:universal2?|fat)` + bEnd)},
// amd64 micro-levels before baseline — "amd64v3" must not fall through to amd64.
{buildmeta.ArchAMD64v4, regexp.MustCompile(`(?i)(?:x86[_-]64[_-]v4|amd64v4|v4-amd64)`)},
{buildmeta.ArchAMD64v3, regexp.MustCompile(`(?i)(?:x86[_-]64[_-]v3|amd64v3|v3-amd64)`)},
{buildmeta.ArchAMD64v2, regexp.MustCompile(`(?i)(?:x86[_-]64[_-]v2|amd64v2|v2-amd64)`)},
// amd64 baseline before x86 — "x86_64" must not match as x86.
{buildmeta.ArchAMD64, regexp.MustCompile(`(?i)(?:x86[_-]64|amd64|x64|64-?bit)`)},
// arm64 before armv7/armv6 — "aarch64" must not match as arm.
{buildmeta.ArchARM64, regexp.MustCompile(`(?i)(?:aarch64|arm64|armv8)`)},
{buildmeta.ArchARMv7, regexp.MustCompile(`(?i)(?:armv7l?|arm-?v7|arm7|arm32|armhf)`)},
{buildmeta.ArchARMv6, regexp.MustCompile(`(?i)(?:armv6l?|arm-?v6|aarch32|` + b + `arm` + bEnd + `)`)},
{buildmeta.ArchARMv5, regexp.MustCompile(`(?i)(?:armv5)`)},
// ppc64le before ppc64.
{buildmeta.ArchPPC64LE, regexp.MustCompile(`(?i)ppc64le`)},
{buildmeta.ArchPPC64, regexp.MustCompile(`(?i)ppc64`)},
{buildmeta.ArchRISCV64, regexp.MustCompile(`(?i)riscv64`)},
{buildmeta.ArchS390X, regexp.MustCompile(`(?i)s390x`)},
{buildmeta.ArchLoong64, regexp.MustCompile(`(?i)loong(?:arch)?64`)},
{buildmeta.ArchMIPS64LE, regexp.MustCompile(`(?i)mips64(?:el|le)`)},
{buildmeta.ArchMIPS64, regexp.MustCompile(`(?i)mips64`)},
{buildmeta.ArchMIPSLE, regexp.MustCompile(`(?i)mips(?:el|le)`)},
{buildmeta.ArchMIPS, regexp.MustCompile(`(?i)` + b + `mips` + bEnd)},
// x86 last — must not steal x86_64.
{buildmeta.ArchX86, regexp.MustCompile(`(?i)(?:` + b + `x86` + bEnd + `|i[3-6]86|ia32|` + b + `386` + bEnd + `|32-?bit)`)},
}
func detectArch(lower string) buildmeta.Arch {
for _, p := range archPatterns {
if p.pattern.MatchString(lower) {
return p.arch
}
}
return ""
}
// --- Libc detection ---
var (
reMusl = regexp.MustCompile(`(?i)` + b + `musl` + bEnd)
reGNU = regexp.MustCompile(`(?i)` + b + `(?:gnu|glibc)` + bEnd)
reMSVC = regexp.MustCompile(`(?i)` + b + `msvc` + bEnd)
reStatic = regexp.MustCompile(`(?i)` + b + `static` + bEnd)
)
func detectLibc(lower string) buildmeta.Libc {
switch {
case reMusl.MatchString(lower):
return buildmeta.LibcMusl
case reGNU.MatchString(lower):
return buildmeta.LibcGNU
case reMSVC.MatchString(lower):
return buildmeta.LibcMSVC
case reStatic.MatchString(lower):
return buildmeta.LibcNone
}
return ""
}
// --- Format detection ---
// formatSuffixes maps file extensions to formats, longest first.
var formatSuffixes = []struct {
suffix string
format buildmeta.Format
}{
{".tar.gz", buildmeta.FormatTarGz},
{".tar.xz", buildmeta.FormatTarXz},
{".tar.zst", buildmeta.FormatTarZst},
{".exe.xz", buildmeta.FormatExeXz},
{".app.zip", buildmeta.FormatAppZip},
{".tgz", buildmeta.FormatTarGz},
{".zip", buildmeta.FormatZip},
{".gz", buildmeta.FormatGz},
{".xz", buildmeta.FormatXz},
{".zst", buildmeta.FormatZst},
{".7z", buildmeta.Format7z},
{".exe", buildmeta.FormatExe},
{".msi", buildmeta.FormatMSI},
{".dmg", buildmeta.FormatDMG},
{".deb", buildmeta.FormatDeb},
{".rpm", buildmeta.FormatRPM},
{".snap", buildmeta.FormatSnap},
{".appx", buildmeta.FormatAppx},
{".pkg", buildmeta.FormatPkg},
}
func detectFormat(lower string) buildmeta.Format {
// Use the base name to avoid directory separators confusing suffix matching.
base := path.Base(lower)
for _, s := range formatSuffixes {
if strings.HasSuffix(base, s.suffix) {
return s.format
}
}
return ""
}