mirror of
https://github.com/stakater/Reloader.git
synced 2026-02-14 18:09:50 +00:00
feat: Load tests
This commit is contained in:
222
.github/workflows/loadtest.yml
vendored
Normal file
222
.github/workflows/loadtest.yml
vendored
Normal file
@@ -0,0 +1,222 @@
|
||||
name: Load Test
|
||||
|
||||
on:
|
||||
issue_comment:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
loadtest:
|
||||
# Only run on PR comments with /loadtest command
|
||||
if: |
|
||||
github.event.issue.pull_request &&
|
||||
contains(github.event.comment.body, '/loadtest')
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Add reaction to comment
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
await github.rest.reactions.createForIssueComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: context.payload.comment.id,
|
||||
content: 'rocket'
|
||||
});
|
||||
|
||||
- name: Get PR details
|
||||
id: pr
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const pr = await github.rest.pulls.get({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: context.issue.number
|
||||
});
|
||||
core.setOutput('head_ref', pr.data.head.ref);
|
||||
core.setOutput('head_sha', pr.data.head.sha);
|
||||
core.setOutput('base_ref', pr.data.base.ref);
|
||||
core.setOutput('base_sha', pr.data.base.sha);
|
||||
console.log(`PR #${context.issue.number}: ${pr.data.head.ref} -> ${pr.data.base.ref}`);
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.23'
|
||||
cache: false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Install kind
|
||||
run: |
|
||||
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
|
||||
chmod +x ./kind
|
||||
sudo mv ./kind /usr/local/bin/kind
|
||||
|
||||
# Build OLD image from base branch (e.g., main)
|
||||
- name: Checkout base branch (old)
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ steps.pr.outputs.base_ref }}
|
||||
path: old
|
||||
|
||||
- name: Build old image
|
||||
run: |
|
||||
cd old
|
||||
docker build -t localhost/reloader:old -f Dockerfile .
|
||||
echo "Built old image from ${{ steps.pr.outputs.base_ref }} (${{ steps.pr.outputs.base_sha }})"
|
||||
|
||||
# Build NEW image from PR branch
|
||||
- name: Checkout PR branch (new)
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ steps.pr.outputs.head_ref }}
|
||||
path: new
|
||||
|
||||
- name: Build new image
|
||||
run: |
|
||||
cd new
|
||||
docker build -t localhost/reloader:new -f Dockerfile .
|
||||
echo "Built new image from ${{ steps.pr.outputs.head_ref }} (${{ steps.pr.outputs.head_sha }})"
|
||||
|
||||
# Build and run loadtest from PR branch
|
||||
- name: Build loadtest tool
|
||||
run: |
|
||||
cd new/test/loadtest
|
||||
go build -o loadtest ./cmd/loadtest
|
||||
|
||||
- name: Run A/B comparison load test
|
||||
id: loadtest
|
||||
run: |
|
||||
cd new/test/loadtest
|
||||
./loadtest run \
|
||||
--old-image=localhost/reloader:old \
|
||||
--new-image=localhost/reloader:new \
|
||||
--scenario=all \
|
||||
--duration=60 2>&1 | tee loadtest-output.txt
|
||||
echo "exitcode=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Upload results
|
||||
uses: actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: loadtest-results
|
||||
path: |
|
||||
new/test/loadtest/results/
|
||||
new/test/loadtest/loadtest-output.txt
|
||||
retention-days: 30
|
||||
|
||||
- name: Post results comment
|
||||
uses: actions/github-script@v7
|
||||
if: always()
|
||||
with:
|
||||
script: |
|
||||
const fs = require('fs');
|
||||
|
||||
let results = '';
|
||||
const resultsDir = 'new/test/loadtest/results';
|
||||
|
||||
// Collect summary of all scenarios
|
||||
let passCount = 0;
|
||||
let failCount = 0;
|
||||
const summaries = [];
|
||||
|
||||
if (fs.existsSync(resultsDir)) {
|
||||
const scenarios = fs.readdirSync(resultsDir).sort();
|
||||
for (const scenario of scenarios) {
|
||||
const reportPath = `${resultsDir}/${scenario}/report.txt`;
|
||||
if (fs.existsSync(reportPath)) {
|
||||
const report = fs.readFileSync(reportPath, 'utf8');
|
||||
|
||||
// Extract status from report
|
||||
const statusMatch = report.match(/Status:\s+(PASS|FAIL)/);
|
||||
const status = statusMatch ? statusMatch[1] : 'UNKNOWN';
|
||||
|
||||
if (status === 'PASS') passCount++;
|
||||
else failCount++;
|
||||
|
||||
// Extract key metrics for summary
|
||||
const actionMatch = report.match(/action_total\s+[\d.]+\s+[\d.]+\s+[\d.]+/);
|
||||
const errorsMatch = report.match(/errors_total\s+[\d.]+\s+[\d.]+/);
|
||||
|
||||
summaries.push(`| ${scenario} | ${status === 'PASS' ? '✅' : '❌'} ${status} |`);
|
||||
|
||||
results += `\n<details>\n<summary>${status === 'PASS' ? '✅' : '❌'} ${scenario}</summary>\n\n\`\`\`\n${report}\n\`\`\`\n</details>\n`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!results) {
|
||||
// Read raw output if no reports
|
||||
if (fs.existsSync('new/test/loadtest/loadtest-output.txt')) {
|
||||
const output = fs.readFileSync('new/test/loadtest/loadtest-output.txt', 'utf8');
|
||||
const maxLen = 60000;
|
||||
results = output.length > maxLen
|
||||
? output.substring(output.length - maxLen)
|
||||
: output;
|
||||
results = `\`\`\`\n${results}\n\`\`\``;
|
||||
} else {
|
||||
results = 'No results available';
|
||||
}
|
||||
}
|
||||
|
||||
const overallStatus = failCount === 0 ? '✅ ALL PASSED' : `❌ ${failCount} FAILED`;
|
||||
|
||||
const body = `## Load Test Results ${overallStatus}
|
||||
|
||||
**Comparing:** \`${{ steps.pr.outputs.base_ref }}\` (old) vs \`${{ steps.pr.outputs.head_ref }}\` (new)
|
||||
**Old commit:** ${{ steps.pr.outputs.base_sha }}
|
||||
**New commit:** ${{ steps.pr.outputs.head_sha }}
|
||||
**Triggered by:** @${{ github.event.comment.user.login }}
|
||||
|
||||
### Summary
|
||||
|
||||
| Scenario | Status |
|
||||
|----------|--------|
|
||||
${summaries.join('\n')}
|
||||
|
||||
**Total:** ${passCount} passed, ${failCount} failed
|
||||
|
||||
### Detailed Results
|
||||
|
||||
${results}
|
||||
|
||||
<details>
|
||||
<summary>📦 Download full results</summary>
|
||||
|
||||
Artifacts are available in the [workflow run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}).
|
||||
</details>
|
||||
`;
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: body
|
||||
});
|
||||
|
||||
- name: Add success reaction
|
||||
if: success()
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
await github.rest.reactions.createForIssueComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: context.payload.comment.id,
|
||||
content: '+1'
|
||||
});
|
||||
|
||||
- name: Add failure reaction
|
||||
if: failure()
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
await github.rest.reactions.createForIssueComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: context.payload.comment.id,
|
||||
content: '-1'
|
||||
});
|
||||
@@ -103,6 +103,8 @@ func NewController(
|
||||
|
||||
// Add function to add a new object to the queue in case of creating a resource
|
||||
func (c *Controller) Add(obj interface{}) {
|
||||
// Record event received
|
||||
c.collectors.RecordEventReceived("add", c.resource)
|
||||
|
||||
switch object := obj.(type) {
|
||||
case *v1.Namespace:
|
||||
@@ -112,11 +114,14 @@ func (c *Controller) Add(obj interface{}) {
|
||||
|
||||
if options.ReloadOnCreate == "true" {
|
||||
if !c.resourceInIgnoredNamespace(obj) && c.resourceInSelectedNamespaces(obj) && secretControllerInitialized && configmapControllerInitialized {
|
||||
c.queue.Add(handler.ResourceCreatedHandler{
|
||||
Resource: obj,
|
||||
Collectors: c.collectors,
|
||||
Recorder: c.recorder,
|
||||
c.enqueue(handler.ResourceCreatedHandler{
|
||||
Resource: obj,
|
||||
Collectors: c.collectors,
|
||||
Recorder: c.recorder,
|
||||
EnqueueTime: time.Now(), // Track when item was enqueued
|
||||
})
|
||||
} else {
|
||||
c.collectors.RecordSkipped("ignored_or_not_selected")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -166,31 +171,42 @@ func (c *Controller) removeSelectedNamespaceFromCache(namespace v1.Namespace) {
|
||||
|
||||
// Update function to add an old object and a new object to the queue in case of updating a resource
|
||||
func (c *Controller) Update(old interface{}, new interface{}) {
|
||||
// Record event received
|
||||
c.collectors.RecordEventReceived("update", c.resource)
|
||||
|
||||
switch new.(type) {
|
||||
case *v1.Namespace:
|
||||
return
|
||||
}
|
||||
|
||||
if !c.resourceInIgnoredNamespace(new) && c.resourceInSelectedNamespaces(new) {
|
||||
c.queue.Add(handler.ResourceUpdatedHandler{
|
||||
c.enqueue(handler.ResourceUpdatedHandler{
|
||||
Resource: new,
|
||||
OldResource: old,
|
||||
Collectors: c.collectors,
|
||||
Recorder: c.recorder,
|
||||
EnqueueTime: time.Now(), // Track when item was enqueued
|
||||
})
|
||||
} else {
|
||||
c.collectors.RecordSkipped("ignored_or_not_selected")
|
||||
}
|
||||
}
|
||||
|
||||
// Delete function to add an object to the queue in case of deleting a resource
|
||||
func (c *Controller) Delete(old interface{}) {
|
||||
// Record event received
|
||||
c.collectors.RecordEventReceived("delete", c.resource)
|
||||
|
||||
if options.ReloadOnDelete == "true" {
|
||||
if !c.resourceInIgnoredNamespace(old) && c.resourceInSelectedNamespaces(old) && secretControllerInitialized && configmapControllerInitialized {
|
||||
c.queue.Add(handler.ResourceDeleteHandler{
|
||||
Resource: old,
|
||||
Collectors: c.collectors,
|
||||
Recorder: c.recorder,
|
||||
c.enqueue(handler.ResourceDeleteHandler{
|
||||
Resource: old,
|
||||
Collectors: c.collectors,
|
||||
Recorder: c.recorder,
|
||||
EnqueueTime: time.Now(), // Track when item was enqueued
|
||||
})
|
||||
} else {
|
||||
c.collectors.RecordSkipped("ignored_or_not_selected")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,6 +217,13 @@ func (c *Controller) Delete(old interface{}) {
|
||||
}
|
||||
}
|
||||
|
||||
// enqueue adds an item to the queue and records metrics
|
||||
func (c *Controller) enqueue(item interface{}) {
|
||||
c.queue.Add(item)
|
||||
c.collectors.RecordQueueAdd()
|
||||
c.collectors.SetQueueDepth(c.queue.Len())
|
||||
}
|
||||
|
||||
// Run function for controller which handles the queue
|
||||
func (c *Controller) Run(threadiness int, stopCh chan struct{}) {
|
||||
defer runtime.HandleCrash()
|
||||
@@ -242,13 +265,36 @@ func (c *Controller) processNextItem() bool {
|
||||
if quit {
|
||||
return false
|
||||
}
|
||||
|
||||
// Update queue depth after getting item
|
||||
c.collectors.SetQueueDepth(c.queue.Len())
|
||||
|
||||
// Tell the queue that we are done with processing this key. This unblocks the key for other workers
|
||||
// This allows safe parallel processing because two events with the same key are never processed in
|
||||
// parallel.
|
||||
defer c.queue.Done(resourceHandler)
|
||||
|
||||
// Record queue latency if the handler supports it
|
||||
if h, ok := resourceHandler.(handler.TimedHandler); ok {
|
||||
queueLatency := time.Since(h.GetEnqueueTime())
|
||||
c.collectors.RecordQueueLatency(queueLatency)
|
||||
}
|
||||
|
||||
// Track reconcile/handler duration
|
||||
startTime := time.Now()
|
||||
|
||||
// Invoke the method containing the business logic
|
||||
err := resourceHandler.(handler.ResourceHandler).Handle()
|
||||
|
||||
duration := time.Since(startTime)
|
||||
|
||||
// Record reconcile metrics
|
||||
if err != nil {
|
||||
c.collectors.RecordReconcile("error", duration)
|
||||
} else {
|
||||
c.collectors.RecordReconcile("success", duration)
|
||||
}
|
||||
|
||||
// Handle the error if something went wrong during the execution of the business logic
|
||||
c.handleErr(err, resourceHandler)
|
||||
return true
|
||||
@@ -261,16 +307,26 @@ func (c *Controller) handleErr(err error, key interface{}) {
|
||||
// This ensures that future processing of updates for this key is not delayed because of
|
||||
// an outdated error history.
|
||||
c.queue.Forget(key)
|
||||
|
||||
// Record successful event processing
|
||||
c.collectors.RecordEventProcessed("unknown", c.resource, "success")
|
||||
return
|
||||
}
|
||||
|
||||
// Record error
|
||||
c.collectors.RecordError("handler_error")
|
||||
|
||||
// This controller retries 5 times if something goes wrong. After that, it stops trying.
|
||||
if c.queue.NumRequeues(key) < 5 {
|
||||
logrus.Errorf("Error syncing events: %v", err)
|
||||
|
||||
// Record retry
|
||||
c.collectors.RecordRetry()
|
||||
|
||||
// Re-enqueue the key rate limited. Based on the rate limiter on the
|
||||
// queue and the re-enqueue history, the key will be processed later again.
|
||||
c.queue.AddRateLimited(key)
|
||||
c.collectors.SetQueueDepth(c.queue.Len())
|
||||
return
|
||||
}
|
||||
|
||||
@@ -279,4 +335,7 @@ func (c *Controller) handleErr(err error, key interface{}) {
|
||||
runtime.HandleError(err)
|
||||
logrus.Errorf("Dropping key out of the queue: %v", err)
|
||||
logrus.Debugf("Dropping the key %q out of the queue: %v", key, err)
|
||||
|
||||
// Record failed event processing
|
||||
c.collectors.RecordEventProcessed("unknown", c.resource, "dropped")
|
||||
}
|
||||
|
||||
@@ -2157,19 +2157,21 @@ func TestController_resourceInIgnoredNamespace(t *testing.T) {
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
c := &Controller{
|
||||
client: tt.fields.client,
|
||||
indexer: tt.fields.indexer,
|
||||
queue: tt.fields.queue,
|
||||
informer: tt.fields.informer,
|
||||
namespace: tt.fields.namespace,
|
||||
ignoredNamespaces: tt.fields.ignoredNamespaces,
|
||||
}
|
||||
if got := c.resourceInIgnoredNamespace(tt.args.raw); got != tt.want {
|
||||
t.Errorf("Controller.resourceInIgnoredNamespace() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
t.Run(
|
||||
tt.name, func(t *testing.T) {
|
||||
c := &Controller{
|
||||
client: tt.fields.client,
|
||||
indexer: tt.fields.indexer,
|
||||
queue: tt.fields.queue,
|
||||
informer: tt.fields.informer,
|
||||
namespace: tt.fields.namespace,
|
||||
ignoredNamespaces: tt.fields.ignoredNamespaces,
|
||||
}
|
||||
if got := c.resourceInIgnoredNamespace(tt.args.raw); got != tt.want {
|
||||
t.Errorf("Controller.resourceInIgnoredNamespace() = %v, want %v", got, tt.want)
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2331,35 +2333,37 @@ func TestController_resourceInNamespaceSelector(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
fakeClient := fake.NewSimpleClientset()
|
||||
namespace, _ := fakeClient.CoreV1().Namespaces().Create(context.Background(), &tt.fields.namespace, metav1.CreateOptions{})
|
||||
logrus.Infof("created fakeClient namespace for testing = %s", namespace.Name)
|
||||
t.Run(
|
||||
tt.name, func(t *testing.T) {
|
||||
fakeClient := fake.NewSimpleClientset()
|
||||
namespace, _ := fakeClient.CoreV1().Namespaces().Create(context.Background(), &tt.fields.namespace, metav1.CreateOptions{})
|
||||
logrus.Infof("created fakeClient namespace for testing = %s", namespace.Name)
|
||||
|
||||
c := &Controller{
|
||||
client: fakeClient,
|
||||
indexer: tt.fields.indexer,
|
||||
queue: tt.fields.queue,
|
||||
informer: tt.fields.informer,
|
||||
namespace: tt.fields.namespace.Name,
|
||||
namespaceSelector: tt.fields.namespaceSelector,
|
||||
}
|
||||
c := &Controller{
|
||||
client: fakeClient,
|
||||
indexer: tt.fields.indexer,
|
||||
queue: tt.fields.queue,
|
||||
informer: tt.fields.informer,
|
||||
namespace: tt.fields.namespace.Name,
|
||||
namespaceSelector: tt.fields.namespaceSelector,
|
||||
}
|
||||
|
||||
listOptions := metav1.ListOptions{}
|
||||
listOptions.LabelSelector = tt.fields.namespaceSelector
|
||||
namespaces, _ := fakeClient.CoreV1().Namespaces().List(context.Background(), listOptions)
|
||||
listOptions := metav1.ListOptions{}
|
||||
listOptions.LabelSelector = tt.fields.namespaceSelector
|
||||
namespaces, _ := fakeClient.CoreV1().Namespaces().List(context.Background(), listOptions)
|
||||
|
||||
for _, ns := range namespaces.Items {
|
||||
c.addSelectedNamespaceToCache(ns)
|
||||
}
|
||||
for _, ns := range namespaces.Items {
|
||||
c.addSelectedNamespaceToCache(ns)
|
||||
}
|
||||
|
||||
if got := c.resourceInSelectedNamespaces(tt.args.raw); got != tt.want {
|
||||
t.Errorf("Controller.resourceInNamespaceSelector() = %v, want %v", got, tt.want)
|
||||
}
|
||||
if got := c.resourceInSelectedNamespaces(tt.args.raw); got != tt.want {
|
||||
t.Errorf("Controller.resourceInNamespaceSelector() = %v, want %v", got, tt.want)
|
||||
}
|
||||
|
||||
for _, ns := range namespaces.Items {
|
||||
c.removeSelectedNamespaceFromCache(ns)
|
||||
}
|
||||
})
|
||||
for _, ns := range namespaces.Items {
|
||||
c.removeSelectedNamespaceFromCache(ns)
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/stakater/Reloader/internal/pkg/metrics"
|
||||
"github.com/stakater/Reloader/internal/pkg/options"
|
||||
@@ -11,23 +13,45 @@ import (
|
||||
|
||||
// ResourceCreatedHandler contains new objects
|
||||
type ResourceCreatedHandler struct {
|
||||
Resource interface{}
|
||||
Collectors metrics.Collectors
|
||||
Recorder record.EventRecorder
|
||||
Resource interface{}
|
||||
Collectors metrics.Collectors
|
||||
Recorder record.EventRecorder
|
||||
EnqueueTime time.Time // Time when this handler was added to the queue
|
||||
}
|
||||
|
||||
// GetEnqueueTime returns when this handler was enqueued
|
||||
func (r ResourceCreatedHandler) GetEnqueueTime() time.Time {
|
||||
return r.EnqueueTime
|
||||
}
|
||||
|
||||
// Handle processes the newly created resource
|
||||
func (r ResourceCreatedHandler) Handle() error {
|
||||
startTime := time.Now()
|
||||
result := "success"
|
||||
|
||||
defer func() {
|
||||
r.Collectors.RecordReconcile(result, time.Since(startTime))
|
||||
}()
|
||||
|
||||
if r.Resource == nil {
|
||||
logrus.Errorf("Resource creation handler received nil resource")
|
||||
result = "error"
|
||||
} else {
|
||||
config, _ := r.GetConfig()
|
||||
// Send webhook
|
||||
if options.WebhookUrl != "" {
|
||||
return sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
err := sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
}
|
||||
// process resource based on its type
|
||||
return doRollingUpgrade(config, r.Collectors, r.Recorder, invokeReloadStrategy)
|
||||
err := doRollingUpgrade(config, r.Collectors, r.Recorder, invokeReloadStrategy)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package handler
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/stakater/Reloader/internal/pkg/callbacks"
|
||||
@@ -20,23 +21,45 @@ import (
|
||||
|
||||
// ResourceDeleteHandler contains new objects
|
||||
type ResourceDeleteHandler struct {
|
||||
Resource interface{}
|
||||
Collectors metrics.Collectors
|
||||
Recorder record.EventRecorder
|
||||
Resource interface{}
|
||||
Collectors metrics.Collectors
|
||||
Recorder record.EventRecorder
|
||||
EnqueueTime time.Time // Time when this handler was added to the queue
|
||||
}
|
||||
|
||||
// GetEnqueueTime returns when this handler was enqueued
|
||||
func (r ResourceDeleteHandler) GetEnqueueTime() time.Time {
|
||||
return r.EnqueueTime
|
||||
}
|
||||
|
||||
// Handle processes resources being deleted
|
||||
func (r ResourceDeleteHandler) Handle() error {
|
||||
startTime := time.Now()
|
||||
result := "success"
|
||||
|
||||
defer func() {
|
||||
r.Collectors.RecordReconcile(result, time.Since(startTime))
|
||||
}()
|
||||
|
||||
if r.Resource == nil {
|
||||
logrus.Errorf("Resource delete handler received nil resource")
|
||||
result = "error"
|
||||
} else {
|
||||
config, _ := r.GetConfig()
|
||||
// Send webhook
|
||||
if options.WebhookUrl != "" {
|
||||
return sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
err := sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
}
|
||||
// process resource based on its type
|
||||
return doRollingUpgrade(config, r.Collectors, r.Recorder, invokeDeleteStrategy)
|
||||
err := doRollingUpgrade(config, r.Collectors, r.Recorder, invokeDeleteStrategy)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,9 +1,18 @@
|
||||
package handler
|
||||
|
||||
import "github.com/stakater/Reloader/pkg/common"
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/stakater/Reloader/pkg/common"
|
||||
)
|
||||
|
||||
// ResourceHandler handles the creation and update of resources
|
||||
type ResourceHandler interface {
|
||||
Handle() error
|
||||
GetConfig() (common.Config, string)
|
||||
}
|
||||
|
||||
// TimedHandler is a handler that tracks when it was enqueued
|
||||
type TimedHandler interface {
|
||||
GetEnqueueTime() time.Time
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/stakater/Reloader/internal/pkg/metrics"
|
||||
"github.com/stakater/Reloader/internal/pkg/options"
|
||||
@@ -16,21 +18,47 @@ type ResourceUpdatedHandler struct {
|
||||
OldResource interface{}
|
||||
Collectors metrics.Collectors
|
||||
Recorder record.EventRecorder
|
||||
EnqueueTime time.Time // Time when this handler was added to the queue
|
||||
}
|
||||
|
||||
// GetEnqueueTime returns when this handler was enqueued
|
||||
func (r ResourceUpdatedHandler) GetEnqueueTime() time.Time {
|
||||
return r.EnqueueTime
|
||||
}
|
||||
|
||||
// Handle processes the updated resource
|
||||
func (r ResourceUpdatedHandler) Handle() error {
|
||||
startTime := time.Now()
|
||||
result := "success"
|
||||
|
||||
defer func() {
|
||||
r.Collectors.RecordReconcile(result, time.Since(startTime))
|
||||
}()
|
||||
|
||||
if r.Resource == nil || r.OldResource == nil {
|
||||
logrus.Errorf("Resource update handler received nil resource")
|
||||
result = "error"
|
||||
} else {
|
||||
config, oldSHAData := r.GetConfig()
|
||||
if config.SHAValue != oldSHAData {
|
||||
// Send a webhook if update
|
||||
if options.WebhookUrl != "" {
|
||||
return sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
err := sendUpgradeWebhook(config, options.WebhookUrl)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
}
|
||||
// process resource based on its type
|
||||
return doRollingUpgrade(config, r.Collectors, r.Recorder, invokeReloadStrategy)
|
||||
err := doRollingUpgrade(config, r.Collectors, r.Recorder, invokeReloadStrategy)
|
||||
if err != nil {
|
||||
result = "error"
|
||||
}
|
||||
return err
|
||||
} else {
|
||||
// No data change - skip
|
||||
result = "skipped"
|
||||
r.Collectors.RecordSkipped("no_data_change")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/parnurzeal/gorequest"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
@@ -236,23 +237,34 @@ func rollingUpgrade(clients kube.Clients, config common.Config, upgradeFuncs cal
|
||||
func PerformAction(clients kube.Clients, config common.Config, upgradeFuncs callbacks.RollingUpgradeFuncs, collectors metrics.Collectors, recorder record.EventRecorder, strategy invokeStrategy) error {
|
||||
items := upgradeFuncs.ItemsFunc(clients, config.Namespace)
|
||||
|
||||
// Record workloads scanned
|
||||
collectors.RecordWorkloadsScanned(upgradeFuncs.ResourceType, len(items))
|
||||
|
||||
matchedCount := 0
|
||||
for _, item := range items {
|
||||
err := retryOnConflict(retry.DefaultRetry, func(fetchResource bool) error {
|
||||
return upgradeResource(clients, config, upgradeFuncs, collectors, recorder, strategy, item, fetchResource)
|
||||
err := retryOnConflict(retry.DefaultRetry, func(fetchResource bool) (bool, error) {
|
||||
matched, err := upgradeResource(clients, config, upgradeFuncs, collectors, recorder, strategy, item, fetchResource)
|
||||
if matched {
|
||||
matchedCount++
|
||||
}
|
||||
return matched, err
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Record workloads matched
|
||||
collectors.RecordWorkloadsMatched(upgradeFuncs.ResourceType, matchedCount)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func retryOnConflict(backoff wait.Backoff, fn func(_ bool) error) error {
|
||||
func retryOnConflict(backoff wait.Backoff, fn func(_ bool) (bool, error)) error {
|
||||
var lastError error
|
||||
fetchResource := false // do not fetch resource on first attempt, already done by ItemsFunc
|
||||
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
|
||||
err := fn(fetchResource)
|
||||
_, err := fn(fetchResource)
|
||||
fetchResource = true
|
||||
switch {
|
||||
case err == nil:
|
||||
@@ -270,17 +282,19 @@ func retryOnConflict(backoff wait.Backoff, fn func(_ bool) error) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs callbacks.RollingUpgradeFuncs, collectors metrics.Collectors, recorder record.EventRecorder, strategy invokeStrategy, resource runtime.Object, fetchResource bool) error {
|
||||
func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs callbacks.RollingUpgradeFuncs, collectors metrics.Collectors, recorder record.EventRecorder, strategy invokeStrategy, resource runtime.Object, fetchResource bool) (bool, error) {
|
||||
actionStartTime := time.Now()
|
||||
|
||||
accessor, err := meta.Accessor(resource)
|
||||
if err != nil {
|
||||
return err
|
||||
return false, err
|
||||
}
|
||||
|
||||
resourceName := accessor.GetName()
|
||||
if fetchResource {
|
||||
resource, err = upgradeFuncs.ItemFunc(clients, resourceName, config.Namespace)
|
||||
if err != nil {
|
||||
return err
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
annotations := upgradeFuncs.AnnotationsFunc(resource)
|
||||
@@ -289,13 +303,14 @@ func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs ca
|
||||
|
||||
if !result.ShouldReload {
|
||||
logrus.Debugf("No changes detected in '%s' of type '%s' in namespace '%s'", config.ResourceName, config.Type, config.Namespace)
|
||||
return nil
|
||||
return false, nil
|
||||
}
|
||||
|
||||
strategyResult := strategy(upgradeFuncs, resource, config, result.AutoReload)
|
||||
|
||||
if strategyResult.Result != constants.Updated {
|
||||
return nil
|
||||
collectors.RecordSkipped("strategy_not_updated")
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// find correct annotation and update the resource
|
||||
@@ -309,7 +324,7 @@ func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs ca
|
||||
_, err = PauseDeployment(deployment, clients, config.Namespace, pauseInterval)
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed to pause deployment '%s' in namespace '%s': %v", resourceName, config.Namespace, err)
|
||||
return err
|
||||
return true, err
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -320,16 +335,19 @@ func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs ca
|
||||
err = upgradeFuncs.UpdateFunc(clients, config.Namespace, resource)
|
||||
}
|
||||
|
||||
actionLatency := time.Since(actionStartTime)
|
||||
|
||||
if err != nil {
|
||||
message := fmt.Sprintf("Update for '%s' of type '%s' in namespace '%s' failed with error %v", resourceName, upgradeFuncs.ResourceType, config.Namespace, err)
|
||||
logrus.Errorf("Update for '%s' of type '%s' in namespace '%s' failed with error %v", resourceName, upgradeFuncs.ResourceType, config.Namespace, err)
|
||||
|
||||
collectors.Reloaded.With(prometheus.Labels{"success": "false"}).Inc()
|
||||
collectors.ReloadedByNamespace.With(prometheus.Labels{"success": "false", "namespace": config.Namespace}).Inc()
|
||||
collectors.RecordAction(upgradeFuncs.ResourceType, "error", actionLatency)
|
||||
if recorder != nil {
|
||||
recorder.Event(resource, v1.EventTypeWarning, "ReloadFail", message)
|
||||
}
|
||||
return err
|
||||
return true, err
|
||||
} else {
|
||||
message := fmt.Sprintf("Changes detected in '%s' of type '%s' in namespace '%s'", config.ResourceName, config.Type, config.Namespace)
|
||||
message += fmt.Sprintf(", Updated '%s' of type '%s' in namespace '%s'", resourceName, upgradeFuncs.ResourceType, config.Namespace)
|
||||
@@ -338,6 +356,7 @@ func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs ca
|
||||
|
||||
collectors.Reloaded.With(prometheus.Labels{"success": "true"}).Inc()
|
||||
collectors.ReloadedByNamespace.With(prometheus.Labels{"success": "true", "namespace": config.Namespace}).Inc()
|
||||
collectors.RecordAction(upgradeFuncs.ResourceType, "success", actionLatency)
|
||||
alert_on_reload, ok := os.LookupEnv("ALERT_ON_RELOAD")
|
||||
if recorder != nil {
|
||||
recorder.Event(resource, v1.EventTypeNormal, "Reloaded", message)
|
||||
@@ -350,7 +369,7 @@ func upgradeResource(clients kube.Clients, config common.Config, upgradeFuncs ca
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func getVolumeMountName(volumes []v1.Volume, mountType string, volumeName string) string {
|
||||
|
||||
@@ -1,54 +1,407 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"k8s.io/client-go/tools/metrics"
|
||||
)
|
||||
|
||||
// clientGoRequestMetrics implements metrics.LatencyMetric and metrics.ResultMetric
|
||||
// to expose client-go's rest_client_requests_total metric
|
||||
type clientGoRequestMetrics struct {
|
||||
requestCounter *prometheus.CounterVec
|
||||
requestLatency *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
func (m *clientGoRequestMetrics) Increment(ctx context.Context, code string, method string, host string) {
|
||||
m.requestCounter.WithLabelValues(code, method, host).Inc()
|
||||
}
|
||||
|
||||
func (m *clientGoRequestMetrics) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) {
|
||||
m.requestLatency.WithLabelValues(verb, u.Host).Observe(latency.Seconds())
|
||||
}
|
||||
|
||||
var clientGoMetrics = &clientGoRequestMetrics{
|
||||
requestCounter: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "rest_client_requests_total",
|
||||
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
|
||||
},
|
||||
[]string{"code", "method", "host"},
|
||||
),
|
||||
requestLatency: prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "rest_client_request_duration_seconds",
|
||||
Help: "Request latency in seconds. Broken down by verb and host.",
|
||||
Buckets: []float64{0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30},
|
||||
},
|
||||
[]string{"verb", "host"},
|
||||
),
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Register the metrics collectors
|
||||
prometheus.MustRegister(clientGoMetrics.requestCounter)
|
||||
prometheus.MustRegister(clientGoMetrics.requestLatency)
|
||||
|
||||
// Register our metrics implementation with client-go
|
||||
metrics.RequestResult = clientGoMetrics
|
||||
metrics.RequestLatency = clientGoMetrics
|
||||
}
|
||||
|
||||
// Collectors holds all Prometheus metrics collectors for Reloader.
|
||||
type Collectors struct {
|
||||
// Existing metrics (preserved for backward compatibility)
|
||||
Reloaded *prometheus.CounterVec
|
||||
ReloadedByNamespace *prometheus.CounterVec
|
||||
countByNamespace bool
|
||||
|
||||
// Reconcile/Handler metrics
|
||||
ReconcileTotal *prometheus.CounterVec // Total reconcile calls by result
|
||||
ReconcileDuration *prometheus.HistogramVec // Time spent in reconcile/handler
|
||||
|
||||
// Action metrics
|
||||
ActionTotal *prometheus.CounterVec // Total actions by workload kind and result
|
||||
ActionLatency *prometheus.HistogramVec // Time from event to action applied
|
||||
|
||||
// Skip metrics
|
||||
SkippedTotal *prometheus.CounterVec // Skipped operations by reason
|
||||
|
||||
// Queue metrics
|
||||
QueueDepth prometheus.Gauge // Current queue depth
|
||||
QueueAdds prometheus.Counter // Total items added to queue
|
||||
QueueLatency *prometheus.HistogramVec // Time spent in queue
|
||||
|
||||
// Error and retry metrics
|
||||
ErrorsTotal *prometheus.CounterVec // Errors by type
|
||||
RetriesTotal prometheus.Counter // Total retries
|
||||
|
||||
// Event processing metrics
|
||||
EventsReceived *prometheus.CounterVec // Events received by type (add/update/delete)
|
||||
EventsProcessed *prometheus.CounterVec // Events processed by type and result
|
||||
|
||||
// Resource discovery metrics
|
||||
WorkloadsScanned *prometheus.CounterVec // Workloads scanned by kind
|
||||
WorkloadsMatched *prometheus.CounterVec // Workloads matched for reload by kind
|
||||
}
|
||||
|
||||
// RecordReload records a reload event with the given success status and namespace.
|
||||
// Preserved for backward compatibility.
|
||||
func (c *Collectors) RecordReload(success bool, namespace string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
|
||||
successLabel := "false"
|
||||
if success {
|
||||
successLabel = "true"
|
||||
}
|
||||
|
||||
c.Reloaded.With(prometheus.Labels{"success": successLabel}).Inc()
|
||||
|
||||
if c.countByNamespace {
|
||||
c.ReloadedByNamespace.With(prometheus.Labels{
|
||||
"success": successLabel,
|
||||
"namespace": namespace,
|
||||
}).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RecordReconcile records a reconcile/handler invocation.
|
||||
func (c *Collectors) RecordReconcile(result string, duration time.Duration) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.ReconcileTotal.With(prometheus.Labels{"result": result}).Inc()
|
||||
c.ReconcileDuration.With(prometheus.Labels{"result": result}).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// RecordAction records a reload action on a workload.
|
||||
func (c *Collectors) RecordAction(workloadKind string, result string, latency time.Duration) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.ActionTotal.With(prometheus.Labels{"workload_kind": workloadKind, "result": result}).Inc()
|
||||
c.ActionLatency.With(prometheus.Labels{"workload_kind": workloadKind}).Observe(latency.Seconds())
|
||||
}
|
||||
|
||||
// RecordSkipped records a skipped operation with reason.
|
||||
func (c *Collectors) RecordSkipped(reason string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.SkippedTotal.With(prometheus.Labels{"reason": reason}).Inc()
|
||||
}
|
||||
|
||||
// RecordQueueAdd records an item being added to the queue.
|
||||
func (c *Collectors) RecordQueueAdd() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.QueueAdds.Inc()
|
||||
}
|
||||
|
||||
// SetQueueDepth sets the current queue depth.
|
||||
func (c *Collectors) SetQueueDepth(depth int) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.QueueDepth.Set(float64(depth))
|
||||
}
|
||||
|
||||
// RecordQueueLatency records how long an item spent in the queue.
|
||||
func (c *Collectors) RecordQueueLatency(latency time.Duration) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.QueueLatency.With(prometheus.Labels{}).Observe(latency.Seconds())
|
||||
}
|
||||
|
||||
// RecordError records an error by type.
|
||||
func (c *Collectors) RecordError(errorType string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.ErrorsTotal.With(prometheus.Labels{"type": errorType}).Inc()
|
||||
}
|
||||
|
||||
// RecordRetry records a retry attempt.
|
||||
func (c *Collectors) RecordRetry() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.RetriesTotal.Inc()
|
||||
}
|
||||
|
||||
// RecordEventReceived records an event being received.
|
||||
func (c *Collectors) RecordEventReceived(eventType string, resourceType string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.EventsReceived.With(prometheus.Labels{"event_type": eventType, "resource_type": resourceType}).Inc()
|
||||
}
|
||||
|
||||
// RecordEventProcessed records an event being processed.
|
||||
func (c *Collectors) RecordEventProcessed(eventType string, resourceType string, result string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.EventsProcessed.With(prometheus.Labels{"event_type": eventType, "resource_type": resourceType, "result": result}).Inc()
|
||||
}
|
||||
|
||||
// RecordWorkloadsScanned records workloads scanned during a reconcile.
|
||||
func (c *Collectors) RecordWorkloadsScanned(kind string, count int) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.WorkloadsScanned.With(prometheus.Labels{"kind": kind}).Add(float64(count))
|
||||
}
|
||||
|
||||
// RecordWorkloadsMatched records workloads matched for reload.
|
||||
func (c *Collectors) RecordWorkloadsMatched(kind string, count int) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.WorkloadsMatched.With(prometheus.Labels{"kind": kind}).Add(float64(count))
|
||||
}
|
||||
|
||||
func NewCollectors() Collectors {
|
||||
// Existing metrics (preserved)
|
||||
reloaded := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "reload_executed_total",
|
||||
Help: "Counter of reloads executed by Reloader.",
|
||||
},
|
||||
[]string{
|
||||
"success",
|
||||
},
|
||||
[]string{"success"},
|
||||
)
|
||||
|
||||
//set 0 as default value
|
||||
reloaded.With(prometheus.Labels{"success": "true"}).Add(0)
|
||||
reloaded.With(prometheus.Labels{"success": "false"}).Add(0)
|
||||
|
||||
reloaded_by_namespace := prometheus.NewCounterVec(
|
||||
reloadedByNamespace := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "reload_executed_total_by_namespace",
|
||||
Help: "Counter of reloads executed by Reloader by namespace.",
|
||||
},
|
||||
[]string{
|
||||
"success",
|
||||
"namespace",
|
||||
[]string{"success", "namespace"},
|
||||
)
|
||||
|
||||
// === NEW: Comprehensive metrics ===
|
||||
|
||||
reconcileTotal := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "reconcile_total",
|
||||
Help: "Total number of reconcile/handler invocations by result.",
|
||||
},
|
||||
[]string{"result"},
|
||||
)
|
||||
|
||||
reconcileDuration := prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "reconcile_duration_seconds",
|
||||
Help: "Time spent in reconcile/handler in seconds.",
|
||||
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
|
||||
},
|
||||
[]string{"result"},
|
||||
)
|
||||
|
||||
actionTotal := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "action_total",
|
||||
Help: "Total number of reload actions by workload kind and result.",
|
||||
},
|
||||
[]string{"workload_kind", "result"},
|
||||
)
|
||||
|
||||
actionLatency := prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "action_latency_seconds",
|
||||
Help: "Time from event received to action applied in seconds.",
|
||||
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
|
||||
},
|
||||
[]string{"workload_kind"},
|
||||
)
|
||||
|
||||
skippedTotal := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "skipped_total",
|
||||
Help: "Total number of skipped operations by reason.",
|
||||
},
|
||||
[]string{"reason"},
|
||||
)
|
||||
|
||||
queueDepth := prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "workqueue_depth",
|
||||
Help: "Current depth of the work queue.",
|
||||
},
|
||||
)
|
||||
|
||||
queueAdds := prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "workqueue_adds_total",
|
||||
Help: "Total number of items added to the work queue.",
|
||||
},
|
||||
)
|
||||
|
||||
queueLatency := prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "workqueue_latency_seconds",
|
||||
Help: "Time spent in the work queue in seconds.",
|
||||
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
|
||||
},
|
||||
[]string{},
|
||||
)
|
||||
|
||||
errorsTotal := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "errors_total",
|
||||
Help: "Total number of errors by type.",
|
||||
},
|
||||
[]string{"type"},
|
||||
)
|
||||
|
||||
retriesTotal := prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "retries_total",
|
||||
Help: "Total number of retry attempts.",
|
||||
},
|
||||
)
|
||||
|
||||
eventsReceived := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "events_received_total",
|
||||
Help: "Total number of events received by type and resource.",
|
||||
},
|
||||
[]string{"event_type", "resource_type"},
|
||||
)
|
||||
|
||||
eventsProcessed := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "events_processed_total",
|
||||
Help: "Total number of events processed by type, resource, and result.",
|
||||
},
|
||||
[]string{"event_type", "resource_type", "result"},
|
||||
)
|
||||
|
||||
workloadsScanned := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "workloads_scanned_total",
|
||||
Help: "Total number of workloads scanned by kind.",
|
||||
},
|
||||
[]string{"kind"},
|
||||
)
|
||||
|
||||
workloadsMatched := prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "reloader",
|
||||
Name: "workloads_matched_total",
|
||||
Help: "Total number of workloads matched for reload by kind.",
|
||||
},
|
||||
[]string{"kind"},
|
||||
)
|
||||
|
||||
return Collectors{
|
||||
Reloaded: reloaded,
|
||||
ReloadedByNamespace: reloaded_by_namespace,
|
||||
ReloadedByNamespace: reloadedByNamespace,
|
||||
countByNamespace: os.Getenv("METRICS_COUNT_BY_NAMESPACE") == "enabled",
|
||||
|
||||
ReconcileTotal: reconcileTotal,
|
||||
ReconcileDuration: reconcileDuration,
|
||||
ActionTotal: actionTotal,
|
||||
ActionLatency: actionLatency,
|
||||
SkippedTotal: skippedTotal,
|
||||
QueueDepth: queueDepth,
|
||||
QueueAdds: queueAdds,
|
||||
QueueLatency: queueLatency,
|
||||
ErrorsTotal: errorsTotal,
|
||||
RetriesTotal: retriesTotal,
|
||||
EventsReceived: eventsReceived,
|
||||
EventsProcessed: eventsProcessed,
|
||||
WorkloadsScanned: workloadsScanned,
|
||||
WorkloadsMatched: workloadsMatched,
|
||||
}
|
||||
}
|
||||
|
||||
func SetupPrometheusEndpoint() Collectors {
|
||||
collectors := NewCollectors()
|
||||
|
||||
// Register all metrics
|
||||
prometheus.MustRegister(collectors.Reloaded)
|
||||
prometheus.MustRegister(collectors.ReconcileTotal)
|
||||
prometheus.MustRegister(collectors.ReconcileDuration)
|
||||
prometheus.MustRegister(collectors.ActionTotal)
|
||||
prometheus.MustRegister(collectors.ActionLatency)
|
||||
prometheus.MustRegister(collectors.SkippedTotal)
|
||||
prometheus.MustRegister(collectors.QueueDepth)
|
||||
prometheus.MustRegister(collectors.QueueAdds)
|
||||
prometheus.MustRegister(collectors.QueueLatency)
|
||||
prometheus.MustRegister(collectors.ErrorsTotal)
|
||||
prometheus.MustRegister(collectors.RetriesTotal)
|
||||
prometheus.MustRegister(collectors.EventsReceived)
|
||||
prometheus.MustRegister(collectors.EventsProcessed)
|
||||
prometheus.MustRegister(collectors.WorkloadsScanned)
|
||||
prometheus.MustRegister(collectors.WorkloadsMatched)
|
||||
|
||||
if os.Getenv("METRICS_COUNT_BY_NAMESPACE") == "enabled" {
|
||||
prometheus.MustRegister(collectors.ReloadedByNamespace)
|
||||
|
||||
544
test/loadtest/README.md
Normal file
544
test/loadtest/README.md
Normal file
@@ -0,0 +1,544 @@
|
||||
# Reloader Load Test Framework
|
||||
|
||||
This framework provides A/B comparison testing between two Reloader container images.
|
||||
|
||||
## Overview
|
||||
|
||||
The load test framework:
|
||||
1. Creates a local kind cluster (1 control-plane + 6 worker nodes)
|
||||
2. Deploys Prometheus for metrics collection
|
||||
3. Loads the provided Reloader container images into the cluster
|
||||
4. Runs standardized test scenarios (S1-S13)
|
||||
5. Collects metrics via Prometheus scraping
|
||||
6. Generates comparison reports with pass/fail criteria
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Docker or Podman
|
||||
- kind (Kubernetes in Docker)
|
||||
- kubectl
|
||||
- Go 1.22+
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
cd test/loadtest
|
||||
go build -o loadtest ./cmd/loadtest
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Compare two published images (e.g., different versions)
|
||||
./loadtest run \
|
||||
--old-image=stakater/reloader:v1.0.0 \
|
||||
--new-image=stakater/reloader:v1.1.0
|
||||
|
||||
# Run a specific scenario
|
||||
./loadtest run \
|
||||
--old-image=stakater/reloader:v1.0.0 \
|
||||
--new-image=stakater/reloader:v1.1.0 \
|
||||
--scenario=S2 \
|
||||
--duration=120
|
||||
|
||||
# Test only a single image (no comparison)
|
||||
./loadtest run --new-image=myregistry/reloader:dev
|
||||
|
||||
# Use local images built with docker/podman
|
||||
./loadtest run \
|
||||
--old-image=localhost/reloader:baseline \
|
||||
--new-image=localhost/reloader:feature-branch
|
||||
|
||||
# Skip cluster creation (use existing kind cluster)
|
||||
./loadtest run \
|
||||
--old-image=stakater/reloader:v1.0.0 \
|
||||
--new-image=stakater/reloader:v1.1.0 \
|
||||
--skip-cluster
|
||||
|
||||
# Run all scenarios in parallel on 4 clusters (faster execution)
|
||||
./loadtest run \
|
||||
--new-image=localhost/reloader:dev \
|
||||
--parallelism=4
|
||||
|
||||
# Run all 13 scenarios in parallel (one cluster per scenario)
|
||||
./loadtest run \
|
||||
--new-image=localhost/reloader:dev \
|
||||
--parallelism=13
|
||||
|
||||
# Generate report from existing results
|
||||
./loadtest report --scenario=S2 --results-dir=./results
|
||||
```
|
||||
|
||||
## Command Line Options
|
||||
|
||||
### Run Command
|
||||
|
||||
| Option | Description | Default |
|
||||
|--------|-------------|---------|
|
||||
| `--old-image=IMAGE` | Container image for "old" version | - |
|
||||
| `--new-image=IMAGE` | Container image for "new" version | - |
|
||||
| `--scenario=ID` | Test scenario: S1-S13 or "all" | all |
|
||||
| `--duration=SECONDS` | Test duration in seconds | 60 |
|
||||
| `--parallelism=N` | Run N scenarios in parallel on N kind clusters | 1 |
|
||||
| `--skip-cluster` | Skip kind cluster creation (use existing, only for parallelism=1) | false |
|
||||
| `--results-dir=DIR` | Directory for results | ./results |
|
||||
|
||||
**Note:** At least one of `--old-image` or `--new-image` is required. Provide both for A/B comparison.
|
||||
|
||||
### Report Command
|
||||
|
||||
| Option | Description | Default |
|
||||
|--------|-------------|---------|
|
||||
| `--scenario=ID` | Scenario to report on (required) | - |
|
||||
| `--results-dir=DIR` | Directory containing results | ./results |
|
||||
| `--output=FILE` | Output file (default: stdout) | - |
|
||||
|
||||
## Test Scenarios
|
||||
|
||||
| ID | Name | Description |
|
||||
|-----|-----------------------|-------------------------------------------------|
|
||||
| S1 | Burst Updates | Many ConfigMap/Secret updates in quick succession |
|
||||
| S2 | Fan-Out | One ConfigMap used by many (50) workloads |
|
||||
| S3 | High Cardinality | Many CMs/Secrets across many namespaces |
|
||||
| S4 | No-Op Updates | Updates that don't change data (annotation only)|
|
||||
| S5 | Workload Churn | Deployments created/deleted rapidly |
|
||||
| S6 | Controller Restart | Restart controller pod under load |
|
||||
| S7 | API Pressure | Many concurrent update requests |
|
||||
| S8 | Large Objects | ConfigMaps > 100KB |
|
||||
| S9 | Multi-Workload Types | Tests all workload types (Deploy, STS, DS) |
|
||||
| S10 | Secrets + Mixed | Secrets and mixed ConfigMap+Secret workloads |
|
||||
| S11 | Annotation Strategy | Tests `--reload-strategy=annotations` |
|
||||
| S12 | Pause & Resume | Tests pause-period during rapid updates |
|
||||
| S13 | Complex References | Init containers, valueFrom, projected volumes |
|
||||
|
||||
## Metrics Reference
|
||||
|
||||
This section explains each metric collected during load tests, what it measures, and what different values might indicate.
|
||||
|
||||
### Counter Metrics (Totals)
|
||||
|
||||
#### `reconcile_total`
|
||||
**What it measures:** The total number of reconciliation loops executed by the controller.
|
||||
|
||||
**What it indicates:**
|
||||
- **Higher in new vs old:** The new controller-runtime implementation may batch events differently. This is often expected behavior, not a problem.
|
||||
- **Lower in new vs old:** Better event batching/deduplication. Controller-runtime's work queue naturally deduplicates events.
|
||||
- **Expected behavior:** The new implementation typically has *fewer* reconciles due to intelligent event batching.
|
||||
|
||||
#### `action_total`
|
||||
**What it measures:** The total number of reload actions triggered (rolling restarts of Deployments/StatefulSets/DaemonSets).
|
||||
|
||||
**What it indicates:**
|
||||
- **Should match expected value:** Both implementations should trigger the same number of reloads for the same workload.
|
||||
- **Lower than expected:** Some updates were missed - potential bug or race condition.
|
||||
- **Higher than expected:** Duplicate reloads triggered - inefficiency but not data loss.
|
||||
|
||||
#### `reload_executed_total`
|
||||
**What it measures:** Successful reload operations executed, labeled by `success=true/false`.
|
||||
|
||||
**What it indicates:**
|
||||
- **`success=true` count:** Number of workloads successfully restarted.
|
||||
- **`success=false` count:** Failed restart attempts (API errors, permission issues).
|
||||
- **Should match `action_total`:** If significantly lower, reloads are failing.
|
||||
|
||||
#### `workloads_scanned_total`
|
||||
**What it measures:** Number of workloads (Deployments, etc.) scanned when checking for ConfigMap/Secret references.
|
||||
|
||||
**What it indicates:**
|
||||
- **High count:** Controller is scanning many workloads per reconcile.
|
||||
- **Expected behavior:** Should roughly match the number of workloads × number of reconciles.
|
||||
- **Optimization signal:** If very high, namespace filtering or label selectors could help.
|
||||
|
||||
#### `workloads_matched_total`
|
||||
**What it measures:** Number of workloads that matched (reference the changed ConfigMap/Secret).
|
||||
|
||||
**What it indicates:**
|
||||
- **Should match `reload_executed_total`:** Every matched workload should be reloaded.
|
||||
- **Higher than reloads:** Some matched workloads weren't reloaded (potential issue).
|
||||
|
||||
#### `errors_total`
|
||||
**What it measures:** Total errors encountered, labeled by error type.
|
||||
|
||||
**What it indicates:**
|
||||
- **Should be 0:** Any errors indicate problems.
|
||||
- **Common causes:** API server timeouts, RBAC issues, resource conflicts.
|
||||
- **Critical metric:** Non-zero errors in production should be investigated.
|
||||
|
||||
### API Efficiency Metrics (REST Client)
|
||||
|
||||
These metrics track Kubernetes API server calls made by Reloader. Lower values indicate more efficient operation with less API server load.
|
||||
|
||||
#### `rest_client_requests_total`
|
||||
**What it measures:** Total number of HTTP requests made to the Kubernetes API server.
|
||||
|
||||
**What it indicates:**
|
||||
- **Lower is better:** Fewer API calls means less load on the API server.
|
||||
- **High count:** May indicate inefficient caching or excessive reconciles.
|
||||
- **Comparison use:** Shows overall API efficiency between implementations.
|
||||
|
||||
#### `rest_client_requests_get`
|
||||
**What it measures:** Number of GET requests (fetching individual resources or listings).
|
||||
|
||||
**What it indicates:**
|
||||
- **Includes:** Fetching ConfigMaps, Secrets, Deployments, etc.
|
||||
- **Higher count:** More frequent resource fetching, possibly due to cache misses.
|
||||
- **Expected behavior:** Controller-runtime's caching should reduce GET requests compared to direct API calls.
|
||||
|
||||
#### `rest_client_requests_patch`
|
||||
**What it measures:** Number of PATCH requests (partial updates to resources).
|
||||
|
||||
**What it indicates:**
|
||||
- **Used for:** Rolling restart annotations on workloads.
|
||||
- **Should correlate with:** `reload_executed_total` - each reload typically requires one PATCH.
|
||||
- **Lower is better:** Fewer patches means more efficient batching or deduplication.
|
||||
|
||||
#### `rest_client_requests_put`
|
||||
**What it measures:** Number of PUT requests (full resource updates).
|
||||
|
||||
**What it indicates:**
|
||||
- **Used for:** Full object replacements (less common than PATCH).
|
||||
- **Should be low:** Most updates use PATCH for efficiency.
|
||||
- **High count:** May indicate suboptimal update strategy.
|
||||
|
||||
#### `rest_client_requests_errors`
|
||||
**What it measures:** Number of failed API requests (4xx/5xx responses).
|
||||
|
||||
**What it indicates:**
|
||||
- **Should be 0:** Errors indicate API server issues or permission problems.
|
||||
- **Common causes:** Rate limiting, RBAC issues, resource conflicts, network issues.
|
||||
- **Non-zero:** Investigate API server logs and Reloader permissions.
|
||||
|
||||
### Latency Metrics (Percentiles)
|
||||
|
||||
All latency metrics are reported in **seconds**. The report shows p50 (median), p95, and p99 percentiles.
|
||||
|
||||
#### `reconcile_duration (s)`
|
||||
**What it measures:** Time spent inside each reconcile loop, from start to finish.
|
||||
|
||||
**What it indicates:**
|
||||
- **p50 (median):** Typical reconcile time. Should be < 100ms for good performance.
|
||||
- **p95:** 95th percentile - only 5% of reconciles take longer than this.
|
||||
- **p99:** 99th percentile - indicates worst-case performance.
|
||||
|
||||
**Interpreting differences:**
|
||||
- **New higher than old:** Controller-runtime reconciles may do more work per loop but run fewer times. Check `reconcile_total` - if it's lower, this is expected.
|
||||
- **Minor differences (< 0.5s absolute):** Not significant for sub-second values.
|
||||
|
||||
#### `action_latency (s)`
|
||||
**What it measures:** End-to-end time from ConfigMap/Secret change detection to workload restart triggered.
|
||||
|
||||
**What it indicates:**
|
||||
- **This is the user-facing latency:** How long users wait for their config changes to take effect.
|
||||
- **p50 < 1s:** Excellent - most changes apply within a second.
|
||||
- **p95 < 5s:** Good - even under load, changes apply quickly.
|
||||
- **p99 > 10s:** May need investigation - some changes take too long.
|
||||
|
||||
**What affects this:**
|
||||
- API server responsiveness
|
||||
- Number of workloads to scan
|
||||
- Concurrent updates competing for resources
|
||||
|
||||
### Understanding the Report
|
||||
|
||||
#### Report Columns
|
||||
|
||||
```
|
||||
Metric Old New Expected Old✓ New✓ Status
|
||||
------ --- --- -------- ---- ---- ------
|
||||
action_total 100.00 100.00 100 ✓ ✓ pass
|
||||
action_latency_p95 (s) 0.15 0.04 - - - pass
|
||||
```
|
||||
|
||||
- **Old/New:** Measured values from each implementation
|
||||
- **Expected:** Known expected value (for throughput metrics)
|
||||
- **Old✓/New✓:** Whether the value is within 15% of expected (✓ = yes, ✗ = no, - = no expected value)
|
||||
- **Status:** pass/fail based on comparison thresholds
|
||||
|
||||
#### Pass/Fail Logic

| Metric Type | Pass Condition |
|-------------|----------------|
| Throughput (action_total, reload_executed_total) | New value within 15% of expected |
| Latency (p50, p95, p99) | New not more than threshold% worse than old, OR absolute difference < minimum threshold |
| Errors | New ≤ Old (ideally both 0) |
| API Efficiency (rest_client_requests_*) | New ≤ Old (lower is better), or New not more than 50% higher |

#### Latency Thresholds

Latency comparisons use both percentage AND absolute thresholds to avoid false failures:

| Metric | Max % Worse | Min Absolute Diff |
|--------|-------------|-------------------|
| p50 | 100% | 0.5s |
| p95 | 100% | 1.0s |
| p99 | 100% | 1.0s |

**Example:** If old p50 = 0.01s and new p50 = 0.08s:

- Percentage difference: +700% (would fail the % check)
- Absolute difference: 0.07s (< 0.5s threshold)
- **Result: PASS** (both values are fast enough that the difference doesn't matter)

A sketch of this dual-threshold check follows.

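This is a minimal sketch under the thresholds from the table above; the helper name is illustrative, not the tool's actual API:

```go
// Dual-threshold latency comparison: pass when the new percentile is not more
// than maxPctWorse percent worse than the old one, OR the absolute difference
// is below minAbsDiff seconds.
package main

import "fmt"

func latencyPasses(oldSec, newSec, maxPctWorse, minAbsDiff float64) bool {
	diff := newSec - oldSec
	if diff <= 0 {
		return true // new is equal or faster
	}
	if diff < minAbsDiff {
		return true // absolute difference too small to matter
	}
	if oldSec > 0 && (diff/oldSec)*100 <= maxPctWorse {
		return true // within the allowed percentage regression
	}
	return false
}

func main() {
	// The worked example above: +700% but only 0.07s slower -> PASS.
	fmt.Println(latencyPasses(0.01, 0.08, 100, 0.5)) // true
	// A genuine regression: 2s -> 5s is both >100% worse and >1s slower -> FAIL.
	fmt.Println(latencyPasses(2.0, 5.0, 100, 1.0)) // false
}
```

In practice, very fast percentiles can regress by a large percentage without failing, as long as the absolute slowdown stays under half a second (one second for p95/p99).
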
### Resource Consumption Metrics

These metrics track CPU, memory, and Go runtime resource usage. Lower values generally indicate more efficient operation.

#### Memory Metrics

| Metric | Description | Unit |
|--------|-------------|------|
| `memory_rss_mb_avg` | Average RSS (resident set size) memory | MB |
| `memory_rss_mb_max` | Peak RSS memory during test | MB |
| `memory_heap_mb_avg` | Average Go heap allocation | MB |
| `memory_heap_mb_max` | Peak Go heap allocation | MB |

**What to watch for:**

- **High RSS:** May indicate memory leaks or inefficient caching
- **High heap:** Many objects being created (check GC metrics)
- **Growing over time:** Potential memory leak

#### CPU Metrics

| Metric | Description | Unit |
|--------|-------------|------|
| `cpu_cores_avg` | Average CPU usage rate | cores |
| `cpu_cores_max` | Peak CPU usage rate | cores |

**What to watch for:**

- **High CPU:** Inefficient algorithms or excessive reconciles
- **Spiky max:** May indicate burst handling issues

#### Go Runtime Metrics

| Metric | Description | Unit |
|--------|-------------|------|
| `goroutines_avg` | Average goroutine count | count |
| `goroutines_max` | Peak goroutine count | count |
| `gc_pause_p99_ms` | 99th percentile GC pause time | ms |

**What to watch for:**

- **High goroutines:** Potential goroutine leak or unbounded concurrency
- **High GC pause:** Large heap or allocation pressure

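These rows are derived from the standard process and Go runtime series exposed by Reloader and scraped by Prometheus (the raw series are in bytes; the reported values are in MB). A sketch of the underlying queries, with the job name and time window as example values:

```go
// Sketch: the Prometheus series behind the resource consumption rows above.
package main

import "fmt"

func main() {
	job, window := "reloader-new", "10m"
	queries := map[string]string{
		// memory_rss_mb_avg / _max (collected in bytes)
		"memory_rss_avg": fmt.Sprintf(`avg_over_time(process_resident_memory_bytes{job="%s"}[%s])`, job, window),
		"memory_rss_max": fmt.Sprintf(`max_over_time(process_resident_memory_bytes{job="%s"}[%s])`, job, window),
		// memory_heap_mb_avg
		"memory_heap_avg": fmt.Sprintf(`avg_over_time(go_memstats_heap_alloc_bytes{job="%s"}[%s])`, job, window),
		// cpu_cores_avg (rate of CPU seconds ~ cores in use)
		"cpu_cores_avg": fmt.Sprintf(`rate(process_cpu_seconds_total{job="%s"}[%s])`, job, window),
		// goroutines_max
		"goroutines_max": fmt.Sprintf(`max_over_time(go_goroutines{job="%s"}[%s])`, job, window),
	}
	for name, q := range queries {
		fmt.Printf("%s: %s\n", name, q)
	}
}
```
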
### Scenario-Specific Expectations

| Scenario | Key Metrics to Watch | Expected Behavior |
|----------|---------------------|-------------------|
| S1 (Burst) | action_latency_p99, cpu_cores_max, goroutines_max | Should handle bursts without queue backup |
| S2 (Fan-Out) | reconcile_total, workloads_matched, memory_rss_mb_max | One CM change → 50 workload reloads |
| S3 (High Cardinality) | reconcile_duration, memory_heap_mb_avg | Many namespaces shouldn't increase memory |
| S4 (No-Op) | action_total = 0, cpu_cores_avg should be low | Minimal resource usage for no-op |
| S5 (Churn) | errors_total, goroutines_avg | Graceful handling, no goroutine leak |
| S6 (Restart) | All metrics captured | Metrics survive controller restart |
| S7 (API Pressure) | errors_total, cpu_cores_max, goroutines_max | No errors under concurrent load |
| S8 (Large Objects) | memory_rss_mb_max, gc_pause_p99_ms | Large ConfigMaps don't cause OOM or GC issues |
| S9 (Multi-Workload) | reload_executed_total per type | All workload types (Deploy, STS, DS) reload |
| S10 (Secrets) | reload_executed_total, workloads_matched | Both Secrets and ConfigMaps trigger reloads |
| S11 (Annotation) | workload annotations present | Deployments get `last-reloaded-from` annotation |
| S12 (Pause) | reload_executed_total << updates | Pause-period reduces reload frequency |
| S13 (Complex) | reload_executed_total | All reference types trigger reloads |

### Troubleshooting

#### New implementation shows 0 for all metrics

- Check whether Prometheus is scraping the new Reloader pod
- Verify the pod annotations: `prometheus.io/scrape: "true"`
- Check the Prometheus targets page: `http://localhost:9091/targets`

#### Metrics don't match expected values

- Verify the test ran to completion (check the logs)
- Ensure Prometheus scraped the final metrics (there is an 18s wait after the test)
- Check for pod restarts during the test (metrics reset on restart - handled by `increase()`)

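The `increase()` note above is what keeps the restart scenario (S6) honest: a plain `sum()` over a counter drops back to zero when the pod restarts, while `increase()` over the test window preserves the accumulated count. A sketch of the two query shapes the collector switches between (job name and window are example values):

```go
// Sketch: counter queries with and without restart tolerance. For S6 the
// collector uses increase() so a counter reset does not erase the count.
package main

import "fmt"

func main() {
	metric, job, window := "reloader_reload_executed_total", "reloader-new", "10m"

	// Normal scenarios: the raw counter value is fine.
	plain := fmt.Sprintf(`sum(%s{job="%s"}) by (success, reason)`, metric, job)

	// Restart scenario: increase() tolerates the reset caused by the pod restart.
	withReset := fmt.Sprintf(`sum(increase(%s{job="%s"}[%s])) by (success, reason)`, metric, job, window)

	fmt.Println(plain)
	fmt.Println(withReset)
}
```
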
#### High latency in new implementation

- Check the Reloader pod's resource limits
- Look for API server throttling in the logs
- Compare `reconcile_total` - fewer reconciles with a higher duration may be normal

#### REST client errors are non-zero

- **Common causes:**
  - Optional CRD schemes registered but CRDs not installed (e.g., Argo Rollouts, OpenShift DeploymentConfig)
  - API server rate limiting under high load
  - RBAC permissions missing for certain resource types
- **Argo Rollouts errors:** If you see ~4 errors per test, set `--enable-argo-rollouts=false` when Argo Rollouts is not in use
- **OpenShift errors:** Similarly, ensure DeploymentConfig support is disabled on non-OpenShift clusters

#### REST client requests much higher in new implementation

- Check whether caching is working correctly
- Look for excessive re-queuing in the controller logs
- Compare `reconcile_total` - more reconciles naturally mean more API calls

## Report Format

The report generator produces a comparison table with units and expected value indicators:

```
================================================================================
                        RELOADER A/B COMPARISON REPORT
================================================================================

Scenario:  S2
Generated: 2026-01-03 14:30:00
Status:    PASS
Summary:   All metrics within acceptable thresholds

Test: S2: Fan-out test - 1 CM update triggers 50 deployment reloads

--------------------------------------------------------------------------------
METRIC COMPARISONS
--------------------------------------------------------------------------------
(Old✓/New✓ = meets expected value within 15%)

Metric                        Old      New      Expected  Old✓  New✓  Status
------                        ---      ---      --------  ----  ----  ------
reconcile_total               50.00    25.00    -         -     -     pass
reconcile_duration_p50 (s)    0.01     0.05     -         -     -     pass
reconcile_duration_p95 (s)    0.02     0.15     -         -     -     pass
action_total                  50.00    50.00    50        ✓     ✓     pass
action_latency_p50 (s)        0.05     0.03     -         -     -     pass
action_latency_p95 (s)        0.12     0.08     -         -     -     pass
errors_total                  0.00     0.00     -         -     -     pass
reload_executed_total         50.00    50.00    50        ✓     ✓     pass
workloads_scanned_total       50.00    50.00    50        ✓     ✓     pass
workloads_matched_total       50.00    50.00    50        ✓     ✓     pass
rest_client_requests_total    850      720      -         -     -     pass
rest_client_requests_get      500      420      -         -     -     pass
rest_client_requests_patch    300      250      -         -     -     pass
rest_client_requests_errors   0        0        -         -     -     pass
```

Reports are saved to `results/<scenario>/report.txt` after each test.

## Directory Structure

```
test/loadtest/
├── cmd/
│   └── loadtest/            # Unified CLI (run + report)
│       └── main.go
├── internal/
│   ├── cluster/             # Kind cluster management
│   │   └── kind.go
│   ├── prometheus/          # Prometheus deployment & querying
│   │   └── prometheus.go
│   ├── reloader/            # Reloader deployment
│   │   └── deploy.go
│   └── scenarios/           # Test scenario implementations
│       └── scenarios.go
├── manifests/
│   └── prometheus.yaml      # Prometheus deployment manifest
├── results/                 # Generated after tests
│   └── <scenario>/
│       ├── old/             # Old version data
│       │   ├── *.json       # Prometheus metric snapshots
│       │   └── reloader.log # Reloader pod logs
│       ├── new/             # New version data
│       │   ├── *.json       # Prometheus metric snapshots
│       │   └── reloader.log # Reloader pod logs
│       ├── expected.json    # Expected values from test
│       └── report.txt       # Comparison report
├── go.mod
├── go.sum
└── README.md
```

## Building Local Images for Testing

If you want to test local code changes:

```bash
# Build the new Reloader image from current source
docker build -t localhost/reloader:dev -f Dockerfile .

# Build from a different branch/commit
git checkout feature-branch
docker build -t localhost/reloader:feature -f Dockerfile .

# Then run comparison
./loadtest run \
  --old-image=stakater/reloader:v1.0.0 \
  --new-image=localhost/reloader:feature
```

## Interpreting Results

### PASS

All metrics are within acceptable thresholds. The new implementation is comparable to or better than the old one.

### FAIL

One or more metrics exceeded their thresholds. Review the specific metrics:

- **Latency degradation**: p95/p99 latencies are significantly higher
- **Missed reloads**: `reload_executed_total` differs significantly
- **Errors increased**: `errors_total` is higher in the new version

### Investigation

If tests fail, check:

1. Pod logs: `kubectl logs -n reloader-new deployment/reloader` (or check `results/<scenario>/new/reloader.log`)
2. Resource usage: `kubectl top pods -n reloader-new`
3. Events: `kubectl get events -n reloader-test`

## Parallel Execution

The `--parallelism` option enables running scenarios on multiple kind clusters simultaneously, significantly reducing total test time.

### How It Works

1. **Multiple Clusters**: Creates N kind clusters named `reloader-loadtest-0`, `reloader-loadtest-1`, etc.
2. **Separate Prometheus**: Each cluster gets its own Prometheus instance with a unique port (9091, 9092, etc.)
3. **Worker Pool**: Scenarios are distributed to workers via a channel, with each worker running on its own cluster (sketched below)
4. **Independent Execution**: Each scenario runs in complete isolation with no resource contention

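A minimal, purely illustrative sketch of that worker-pool pattern - scenario names fed through a channel, one goroutine per cluster. The names and the `runScenario` stub are hypothetical; the real orchestration lives in `cmd/loadtest`:

```go
// Illustrative worker pool: one worker per cluster drains a shared scenario channel.
package main

import (
	"fmt"
	"sync"
)

func runScenario(cluster, scenario string) {
	fmt.Printf("[%s] running %s\n", cluster, scenario) // placeholder for the real test run
}

func main() {
	scenarios := []string{"S1", "S2", "S3", "S4", "S5"}
	parallelism := 2 // e.g. --parallelism=2 -> reloader-loadtest-0 and -1

	work := make(chan string)
	var wg sync.WaitGroup
	for i := 0; i < parallelism; i++ {
		cluster := fmt.Sprintf("reloader-loadtest-%d", i)
		wg.Add(1)
		go func() {
			defer wg.Done()
			for s := range work { // each worker owns one cluster
				runScenario(cluster, s)
			}
		}()
	}
	for _, s := range scenarios {
		work <- s
	}
	close(work)
	wg.Wait()
}
```

With `--parallelism=4`, four such workers drain the channel, so the 13 scenarios finish in roughly four rounds instead of 13 sequential runs.
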
### Usage

```bash
# Run 4 scenarios at a time (creates 4 clusters)
./loadtest run --new-image=my-image:tag --parallelism=4

# Run all 13 scenarios in parallel (creates 13 clusters)
./loadtest run --new-image=my-image:tag --parallelism=13 --scenario=all
```

### Resource Requirements

Parallel execution requires significant system resources:

| Parallelism | Clusters | Est. Memory | Est. CPU |
|-------------|----------|-------------|----------|
| 1 (default) | 1 | ~4GB | 2-4 cores |
| 4 | 4 | ~16GB | 8-16 cores |
| 13 | 13 | ~52GB | 26-52 cores |

### Notes

- The `--skip-cluster` option is not supported with parallelism > 1
- Each worker loads images independently, so initial setup takes longer
- All results are written to the same `--results-dir` with per-scenario subdirectories
- If a cluster setup fails, remaining workers continue with available clusters
- Parallelism automatically reduces to match scenario count if set higher

## CI Integration

### GitHub Actions

Load tests can be triggered on pull requests by commenting `/loadtest`:

```
/loadtest
```

This will:

1. Build a container image from the PR branch
2. Run all load test scenarios against it
3. Post results as a PR comment
4. Upload detailed results as artifacts

### Make Target

Run load tests locally or in CI:

```bash
# From repository root
make loadtest
```

This builds the container image and runs all scenarios with a 60-second duration.

1582  test/loadtest/cmd/loadtest/main.go  Normal file
File diff suppressed because it is too large

50  test/loadtest/go.mod  Normal file
@@ -0,0 +1,50 @@
module github.com/stakater/Reloader/test/loadtest

go 1.22.0

require (
	k8s.io/api v0.31.0
	k8s.io/apimachinery v0.31.0
	k8s.io/client-go v0.31.0
)

require (
	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
	github.com/go-logr/logr v1.4.2 // indirect
	github.com/go-openapi/jsonpointer v0.19.6 // indirect
	github.com/go-openapi/jsonreference v0.20.2 // indirect
	github.com/go-openapi/swag v0.22.4 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/protobuf v1.5.4 // indirect
	github.com/google/gnostic-models v0.6.8 // indirect
	github.com/google/go-cmp v0.6.0 // indirect
	github.com/google/gofuzz v1.2.0 // indirect
	github.com/google/uuid v1.6.0 // indirect
	github.com/imdario/mergo v0.3.6 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/spf13/pflag v1.0.5 // indirect
	github.com/x448/float16 v0.8.4 // indirect
	golang.org/x/net v0.26.0 // indirect
	golang.org/x/oauth2 v0.21.0 // indirect
	golang.org/x/sys v0.21.0 // indirect
	golang.org/x/term v0.21.0 // indirect
	golang.org/x/text v0.16.0 // indirect
	golang.org/x/time v0.3.0 // indirect
	google.golang.org/protobuf v1.34.2 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/klog/v2 v2.130.1 // indirect
	k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
	k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
	sigs.k8s.io/yaml v1.4.0 // indirect
)
154  test/loadtest/go.sum  Normal file
@@ -0,0 +1,154 @@
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
|
||||
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
|
||||
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
|
||||
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
|
||||
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
|
||||
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
|
||||
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
|
||||
github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
|
||||
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
|
||||
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
|
||||
github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU=
|
||||
github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
|
||||
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
|
||||
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
|
||||
github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U=
|
||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
|
||||
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM=
|
||||
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28=
|
||||
github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
|
||||
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
|
||||
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
|
||||
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA=
|
||||
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
|
||||
github.com/onsi/gomega v1.19.0 h1:4ieX6qQjPP/BfC3mpsAtIGGlxTWPeA3Inl/7DtXw1tw=
|
||||
github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
||||
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
||||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
|
||||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
|
||||
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
|
||||
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
|
||||
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
|
||||
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
|
||||
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
|
||||
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
|
||||
golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
|
||||
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
|
||||
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
|
||||
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
|
||||
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
|
||||
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
|
||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
||||
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo=
|
||||
k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE=
|
||||
k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc=
|
||||
k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
|
||||
k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8=
|
||||
k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU=
|
||||
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
|
||||
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
|
||||
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
|
||||
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
|
||||
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
|
||||
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
|
||||
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
|
||||
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
|
||||
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
|
||||
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
|
||||
313  test/loadtest/internal/cluster/kind.go  Normal file
@@ -0,0 +1,313 @@
|
||||
// Package cluster provides kind cluster management functionality.
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Config holds configuration for kind cluster operations.
|
||||
type Config struct {
|
||||
Name string
|
||||
ContainerRuntime string // "docker" or "podman"
|
||||
PortOffset int // Offset for host port mappings (for parallel clusters)
|
||||
}
|
||||
|
||||
// Manager handles kind cluster operations.
|
||||
type Manager struct {
|
||||
cfg Config
|
||||
}
|
||||
|
||||
// NewManager creates a new cluster manager.
|
||||
func NewManager(cfg Config) *Manager {
|
||||
return &Manager{cfg: cfg}
|
||||
}
|
||||
|
||||
// DetectContainerRuntime finds available container runtime.
|
||||
func DetectContainerRuntime() (string, error) {
|
||||
if _, err := exec.LookPath("podman"); err == nil {
|
||||
return "podman", nil
|
||||
}
|
||||
if _, err := exec.LookPath("docker"); err == nil {
|
||||
return "docker", nil
|
||||
}
|
||||
return "", fmt.Errorf("neither docker nor podman found in PATH")
|
||||
}
|
||||
|
||||
// Exists checks if the cluster already exists.
|
||||
func (m *Manager) Exists() bool {
|
||||
cmd := exec.Command("kind", "get", "clusters")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.TrimSpace(line) == m.cfg.Name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Delete deletes the kind cluster.
|
||||
func (m *Manager) Delete(ctx context.Context) error {
|
||||
cmd := exec.CommandContext(ctx, "kind", "delete", "cluster", "--name", m.cfg.Name)
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// Create creates a new kind cluster with optimized settings.
|
||||
func (m *Manager) Create(ctx context.Context) error {
|
||||
if m.cfg.ContainerRuntime == "podman" {
|
||||
os.Setenv("KIND_EXPERIMENTAL_PROVIDER", "podman")
|
||||
}
|
||||
|
||||
if m.Exists() {
|
||||
fmt.Printf("Cluster %s already exists, deleting...\n", m.cfg.Name)
|
||||
if err := m.Delete(ctx); err != nil {
|
||||
return fmt.Errorf("deleting existing cluster: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate unique ports based on offset (for parallel clusters)
|
||||
httpPort := 8080 + m.cfg.PortOffset
|
||||
httpsPort := 8443 + m.cfg.PortOffset
|
||||
|
||||
config := fmt.Sprintf(`kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
networking:
|
||||
podSubnet: "10.244.0.0/16"
|
||||
serviceSubnet: "10.96.0.0/16"
|
||||
nodes:
|
||||
- role: control-plane
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: InitConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
node-labels: "ingress-ready=true"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- |
|
||||
kind: ClusterConfiguration
|
||||
apiServer:
|
||||
extraArgs:
|
||||
max-requests-inflight: "800"
|
||||
max-mutating-requests-inflight: "400"
|
||||
watch-cache-sizes: "configmaps#1000,secrets#1000,pods#1000"
|
||||
controllerManager:
|
||||
extraArgs:
|
||||
kube-api-qps: "200"
|
||||
kube-api-burst: "200"
|
||||
scheduler:
|
||||
extraArgs:
|
||||
kube-api-qps: "200"
|
||||
kube-api-burst: "200"
|
||||
extraPortMappings:
|
||||
- containerPort: 80
|
||||
hostPort: %d
|
||||
protocol: TCP
|
||||
- containerPort: 443
|
||||
hostPort: %d
|
||||
protocol: TCP
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
- role: worker
|
||||
kubeadmConfigPatches:
|
||||
- |
|
||||
kind: JoinConfiguration
|
||||
nodeRegistration:
|
||||
kubeletExtraArgs:
|
||||
max-pods: "250"
|
||||
kube-api-qps: "50"
|
||||
kube-api-burst: "100"
|
||||
serialize-image-pulls: "false"
|
||||
event-qps: "50"
|
||||
event-burst: "100"
|
||||
`, httpPort, httpsPort)
|
||||
cmd := exec.CommandContext(ctx, "kind", "create", "cluster", "--name", m.cfg.Name, "--config=-")
|
||||
cmd.Stdin = strings.NewReader(config)
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// GetKubeconfig returns the kubeconfig for the cluster.
|
||||
func (m *Manager) GetKubeconfig() (string, error) {
|
||||
cmd := exec.Command("kind", "get", "kubeconfig", "--name", m.cfg.Name)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("getting kubeconfig: %w", err)
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// Context returns the kubectl context name for this cluster.
|
||||
func (m *Manager) Context() string {
|
||||
return "kind-" + m.cfg.Name
|
||||
}
|
||||
|
||||
// Name returns the cluster name.
|
||||
func (m *Manager) Name() string {
|
||||
return m.cfg.Name
|
||||
}
|
||||
|
||||
// LoadImage loads a container image into the kind cluster.
|
||||
func (m *Manager) LoadImage(ctx context.Context, image string) error {
|
||||
// First check if image exists locally
|
||||
if !m.imageExistsLocally(image) {
|
||||
fmt.Printf(" Image not found locally, pulling: %s\n", image)
|
||||
pullCmd := exec.CommandContext(ctx, m.cfg.ContainerRuntime, "pull", image)
|
||||
pullCmd.Stdout = os.Stdout
|
||||
pullCmd.Stderr = os.Stderr
|
||||
if err := pullCmd.Run(); err != nil {
|
||||
return fmt.Errorf("pulling image %s: %w", image, err)
|
||||
}
|
||||
} else {
|
||||
fmt.Printf(" Image found locally: %s\n", image)
|
||||
}
|
||||
|
||||
fmt.Printf(" Copying image to kind cluster...\n")
|
||||
|
||||
if m.cfg.ContainerRuntime == "podman" {
|
||||
// For podman, save to archive and load
|
||||
tmpFile := fmt.Sprintf("/tmp/kind-image-%d.tar", time.Now().UnixNano())
|
||||
defer os.Remove(tmpFile)
|
||||
|
||||
saveCmd := exec.CommandContext(ctx, m.cfg.ContainerRuntime, "save", image, "-o", tmpFile)
|
||||
if err := saveCmd.Run(); err != nil {
|
||||
return fmt.Errorf("saving image %s: %w", image, err)
|
||||
}
|
||||
|
||||
loadCmd := exec.CommandContext(ctx, "kind", "load", "image-archive", tmpFile, "--name", m.cfg.Name)
|
||||
loadCmd.Stdout = os.Stdout
|
||||
loadCmd.Stderr = os.Stderr
|
||||
if err := loadCmd.Run(); err != nil {
|
||||
return fmt.Errorf("loading image archive: %w", err)
|
||||
}
|
||||
} else {
|
||||
loadCmd := exec.CommandContext(ctx, "kind", "load", "docker-image", image, "--name", m.cfg.Name)
|
||||
loadCmd.Stdout = os.Stdout
|
||||
loadCmd.Stderr = os.Stderr
|
||||
if err := loadCmd.Run(); err != nil {
|
||||
return fmt.Errorf("loading image %s: %w", image, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// imageExistsLocally checks if an image exists in the local container runtime.
|
||||
func (m *Manager) imageExistsLocally(image string) bool {
|
||||
// Try "image exists" command (works for podman)
|
||||
cmd := exec.Command(m.cfg.ContainerRuntime, "image", "exists", image)
|
||||
if err := cmd.Run(); err == nil {
|
||||
return true
|
||||
}
|
||||
|
||||
// Try "image inspect" (works for both docker and podman)
|
||||
cmd = exec.Command(m.cfg.ContainerRuntime, "image", "inspect", image)
|
||||
if err := cmd.Run(); err == nil {
|
||||
return true
|
||||
}
|
||||
|
||||
// Try listing images and grep
|
||||
cmd = exec.Command(m.cfg.ContainerRuntime, "images", "--format", "{{.Repository}}:{{.Tag}}")
|
||||
out, err := cmd.Output()
|
||||
if err == nil {
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.TrimSpace(line) == image {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// PullImage pulls an image using the container runtime.
|
||||
func (m *Manager) PullImage(ctx context.Context, image string) error {
|
||||
cmd := exec.CommandContext(ctx, m.cfg.ContainerRuntime, "pull", image)
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// ExecKubectl runs a kubectl command against the cluster.
|
||||
func (m *Manager) ExecKubectl(ctx context.Context, args ...string) ([]byte, error) {
|
||||
cmd := exec.CommandContext(ctx, "kubectl", args...)
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("%w: %s", err, stderr.String())
|
||||
}
|
||||
return stdout.Bytes(), nil
|
||||
}
|
||||
452  test/loadtest/internal/prometheus/prometheus.go  Normal file
@@ -0,0 +1,452 @@
|
||||
// Package prometheus provides Prometheus deployment and querying functionality.
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Manager handles Prometheus operations.
|
||||
type Manager struct {
|
||||
manifestPath string
|
||||
portForward *exec.Cmd
|
||||
localPort int
|
||||
kubeContext string // Optional: use specific kubeconfig context
|
||||
}
|
||||
|
||||
// NewManager creates a new Prometheus manager.
|
||||
func NewManager(manifestPath string) *Manager {
|
||||
return &Manager{
|
||||
manifestPath: manifestPath,
|
||||
localPort: 9091, // Use 9091 to avoid conflicts
|
||||
}
|
||||
}
|
||||
|
||||
// NewManagerWithPort creates a Prometheus manager with a custom port.
|
||||
func NewManagerWithPort(manifestPath string, port int, kubeContext string) *Manager {
|
||||
return &Manager{
|
||||
manifestPath: manifestPath,
|
||||
localPort: port,
|
||||
kubeContext: kubeContext,
|
||||
}
|
||||
}
|
||||
|
||||
// kubectl returns kubectl args with optional context
|
||||
func (m *Manager) kubectl(args ...string) []string {
|
||||
if m.kubeContext != "" {
|
||||
return append([]string{"--context", m.kubeContext}, args...)
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
// Deploy deploys Prometheus to the cluster.
|
||||
func (m *Manager) Deploy(ctx context.Context) error {
|
||||
// Create namespace
|
||||
cmd := exec.CommandContext(ctx, "kubectl", m.kubectl("create", "namespace", "monitoring", "--dry-run=client", "-o", "yaml")...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return fmt.Errorf("generating namespace yaml: %w", err)
|
||||
}
|
||||
|
||||
applyCmd := exec.CommandContext(ctx, "kubectl", m.kubectl("apply", "-f", "-")...)
|
||||
applyCmd.Stdin = strings.NewReader(string(out))
|
||||
if err := applyCmd.Run(); err != nil {
|
||||
return fmt.Errorf("applying namespace: %w", err)
|
||||
}
|
||||
|
||||
// Apply Prometheus manifest
|
||||
applyCmd = exec.CommandContext(ctx, "kubectl", m.kubectl("apply", "-f", m.manifestPath)...)
|
||||
applyCmd.Stdout = os.Stdout
|
||||
applyCmd.Stderr = os.Stderr
|
||||
if err := applyCmd.Run(); err != nil {
|
||||
return fmt.Errorf("applying prometheus manifest: %w", err)
|
||||
}
|
||||
|
||||
// Wait for Prometheus to be ready
|
||||
fmt.Println("Waiting for Prometheus to be ready...")
|
||||
waitCmd := exec.CommandContext(ctx, "kubectl", m.kubectl("wait", "--for=condition=ready", "pod",
|
||||
"-l", "app=prometheus", "-n", "monitoring", "--timeout=120s")...)
|
||||
waitCmd.Stdout = os.Stdout
|
||||
waitCmd.Stderr = os.Stderr
|
||||
if err := waitCmd.Run(); err != nil {
|
||||
return fmt.Errorf("waiting for prometheus: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartPortForward starts port-forwarding to Prometheus.
|
||||
func (m *Manager) StartPortForward(ctx context.Context) error {
|
||||
m.StopPortForward()
|
||||
|
||||
// Start port-forward
|
||||
m.portForward = exec.CommandContext(ctx, "kubectl", m.kubectl("port-forward",
|
||||
"-n", "monitoring", "svc/prometheus", fmt.Sprintf("%d:9090", m.localPort))...)
|
||||
|
||||
if err := m.portForward.Start(); err != nil {
|
||||
return fmt.Errorf("starting port-forward: %w", err)
|
||||
}
|
||||
|
||||
// Wait for port-forward to be ready
|
||||
for i := 0; i < 30; i++ {
|
||||
time.Sleep(time.Second)
|
||||
if m.isAccessible() {
|
||||
fmt.Printf("Prometheus accessible at http://localhost:%d\n", m.localPort)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return fmt.Errorf("prometheus port-forward not ready after 30s")
|
||||
}
|
||||
|
||||
// StopPortForward stops the port-forward process.
|
||||
func (m *Manager) StopPortForward() {
|
||||
if m.portForward != nil && m.portForward.Process != nil {
|
||||
m.portForward.Process.Kill()
|
||||
m.portForward = nil
|
||||
}
|
||||
// Also kill any lingering port-forwards
|
||||
exec.Command("pkill", "-f", fmt.Sprintf("kubectl port-forward.*prometheus.*%d", m.localPort)).Run()
|
||||
}
|
||||
|
||||
// Reset restarts Prometheus to clear all metrics.
|
||||
func (m *Manager) Reset(ctx context.Context) error {
|
||||
m.StopPortForward()
|
||||
|
||||
// Delete Prometheus pod to reset metrics
|
||||
cmd := exec.CommandContext(ctx, "kubectl", m.kubectl("delete", "pod", "-n", "monitoring",
|
||||
"-l", "app=prometheus", "--grace-period=0", "--force")...)
|
||||
cmd.Run() // Ignore errors
|
||||
|
||||
// Wait for new pod
|
||||
fmt.Println("Waiting for Prometheus to restart...")
|
||||
waitCmd := exec.CommandContext(ctx, "kubectl", m.kubectl("wait", "--for=condition=ready", "pod",
|
||||
"-l", "app=prometheus", "-n", "monitoring", "--timeout=120s")...)
|
||||
if err := waitCmd.Run(); err != nil {
|
||||
return fmt.Errorf("waiting for prometheus restart: %w", err)
|
||||
}
|
||||
|
||||
// Restart port-forward
|
||||
if err := m.StartPortForward(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Wait for scraping to initialize
|
||||
fmt.Println("Waiting 5s for Prometheus to initialize scraping...")
|
||||
time.Sleep(5 * time.Second)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) isAccessible() bool {
|
||||
conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", m.localPort), 2*time.Second)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
conn.Close()
|
||||
|
||||
// Also try HTTP
|
||||
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/api/v1/status/config", m.localPort))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
resp.Body.Close()
|
||||
return resp.StatusCode == 200
|
||||
}
|
||||
|
||||
// URL returns the local Prometheus URL.
|
||||
func (m *Manager) URL() string {
|
||||
return fmt.Sprintf("http://localhost:%d", m.localPort)
|
||||
}
|
||||
|
||||
// WaitForTarget waits for a specific job to be scraped by Prometheus.
|
||||
func (m *Manager) WaitForTarget(ctx context.Context, job string, timeout time.Duration) error {
|
||||
fmt.Printf("Waiting for Prometheus to discover and scrape job '%s'...\n", job)
|
||||
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
if m.isTargetHealthy(job) {
|
||||
fmt.Printf("Prometheus is scraping job '%s'\n", job)
|
||||
return nil
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(2 * time.Second):
|
||||
}
|
||||
}
|
||||
|
||||
// Print debug info on timeout
|
||||
m.printTargetStatus(job)
|
||||
return fmt.Errorf("timeout waiting for Prometheus to scrape job '%s'", job)
|
||||
}
|
||||
|
||||
// isTargetHealthy checks if a job has at least one healthy target.
|
||||
func (m *Manager) isTargetHealthy(job string) bool {
|
||||
resp, err := http.Get(fmt.Sprintf("%s/api/v1/targets", m.URL()))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
ActiveTargets []struct {
|
||||
Labels map[string]string `json:"labels"`
|
||||
Health string `json:"health"`
|
||||
} `json:"activeTargets"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, target := range result.Data.ActiveTargets {
|
||||
if target.Labels["job"] == job && target.Health == "up" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// printTargetStatus prints debug info about targets.
|
||||
func (m *Manager) printTargetStatus(job string) {
|
||||
resp, err := http.Get(fmt.Sprintf("%s/api/v1/targets", m.URL()))
|
||||
if err != nil {
|
||||
fmt.Printf("Failed to get targets: %v\n", err)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
var result struct {
|
||||
Data struct {
|
||||
ActiveTargets []struct {
|
||||
Labels map[string]string `json:"labels"`
|
||||
Health string `json:"health"`
|
||||
LastError string `json:"lastError"`
|
||||
ScrapeURL string `json:"scrapeUrl"`
|
||||
} `json:"activeTargets"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
fmt.Printf("Failed to parse targets: %v\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("Prometheus targets for job '%s':\n", job)
|
||||
found := false
|
||||
for _, target := range result.Data.ActiveTargets {
|
||||
if target.Labels["job"] == job {
|
||||
found = true
|
||||
fmt.Printf(" - %s: health=%s, lastError=%s\n",
|
||||
target.ScrapeURL, target.Health, target.LastError)
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
fmt.Printf(" No targets found for job '%s'\n", job)
|
||||
fmt.Printf(" Available jobs: ")
|
||||
jobs := make(map[string]bool)
|
||||
for _, target := range result.Data.ActiveTargets {
|
||||
jobs[target.Labels["job"]] = true
|
||||
}
|
||||
for j := range jobs {
|
||||
fmt.Printf("%s ", j)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
// HasMetrics checks if the specified job has any metrics available.
|
||||
func (m *Manager) HasMetrics(ctx context.Context, job string) bool {
|
||||
query := fmt.Sprintf(`up{job="%s"}`, job)
|
||||
result, err := m.Query(ctx, query)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return len(result.Data.Result) > 0 && result.Data.Result[0].Value[1] == "1"
|
||||
}
|
||||
|
||||
// QueryResponse represents a Prometheus query response.
|
||||
type QueryResponse struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
ResultType string `json:"resultType"`
|
||||
Result []struct {
|
||||
Metric map[string]string `json:"metric"`
|
||||
Value []interface{} `json:"value"`
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
// Query executes a PromQL query and returns the response.
|
||||
func (m *Manager) Query(ctx context.Context, query string) (*QueryResponse, error) {
|
||||
u := fmt.Sprintf("%s/api/v1/query?query=%s", m.URL(), url.QueryEscape(query))
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", u, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("querying prometheus: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading response: %w", err)
|
||||
}
|
||||
|
||||
var result QueryResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, fmt.Errorf("parsing response: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// CollectMetrics collects all metrics for a scenario and writes to output directory.
|
||||
func (m *Manager) CollectMetrics(ctx context.Context, job, outputDir, scenario string) error {
|
||||
if err := os.MkdirAll(outputDir, 0755); err != nil {
|
||||
return fmt.Errorf("creating output directory: %w", err)
|
||||
}
|
||||
|
||||
timeRange := "10m"
|
||||
|
||||
// For S6 (restart scenario), use increase() to handle counter resets
|
||||
useIncrease := scenario == "S6"
|
||||
|
||||
// Counter metrics
|
||||
counterMetrics := []string{
|
||||
"reloader_reconcile_total",
|
||||
"reloader_action_total",
|
||||
"reloader_skipped_total",
|
||||
"reloader_errors_total",
|
||||
"reloader_events_received_total",
|
||||
"reloader_workloads_scanned_total",
|
||||
"reloader_workloads_matched_total",
|
||||
"reloader_reload_executed_total",
|
||||
}
|
||||
|
||||
for _, metric := range counterMetrics {
|
||||
var query string
|
||||
if useIncrease {
|
||||
query = fmt.Sprintf(`sum(increase(%s{job="%s"}[%s])) by (success, reason)`, metric, job, timeRange)
|
||||
} else {
|
||||
query = fmt.Sprintf(`sum(%s{job="%s"}) by (success, reason)`, metric, job)
|
||||
}
|
||||
|
||||
if err := m.queryAndSave(ctx, query, filepath.Join(outputDir, metric+".json")); err != nil {
|
||||
fmt.Printf("Warning: failed to collect %s: %v\n", metric, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Histogram percentiles
|
||||
histogramMetrics := []struct {
|
||||
name string
|
||||
prefix string
|
||||
}{
|
||||
{"reloader_reconcile_duration_seconds", "reconcile"},
|
||||
{"reloader_action_latency_seconds", "action"},
|
||||
}
|
||||
|
||||
for _, hm := range histogramMetrics {
|
||||
for _, pct := range []int{50, 95, 99} {
|
||||
quantile := float64(pct) / 100
|
||||
query := fmt.Sprintf(`histogram_quantile(%v, sum(rate(%s_bucket{job="%s"}[%s])) by (le))`,
|
||||
quantile, hm.name, job, timeRange)
|
||||
outFile := filepath.Join(outputDir, fmt.Sprintf("%s_p%d.json", hm.prefix, pct))
			if err := m.queryAndSave(ctx, query, outFile); err != nil {
				fmt.Printf("Warning: failed to collect %s p%d: %v\n", hm.name, pct, err)
			}
		}
	}

	// REST client metrics
	restQueries := map[string]string{
		"rest_client_requests_total.json":  fmt.Sprintf(`sum(rest_client_requests_total{job="%s"})`, job),
		"rest_client_requests_get.json":    fmt.Sprintf(`sum(rest_client_requests_total{job="%s",method="GET"})`, job),
		"rest_client_requests_patch.json":  fmt.Sprintf(`sum(rest_client_requests_total{job="%s",method="PATCH"})`, job),
		"rest_client_requests_put.json":    fmt.Sprintf(`sum(rest_client_requests_total{job="%s",method="PUT"})`, job),
		"rest_client_requests_errors.json": fmt.Sprintf(`sum(rest_client_requests_total{job="%s",code=~"[45].."}) or vector(0)`, job),
	}

	for filename, query := range restQueries {
		if err := m.queryAndSave(ctx, query, filepath.Join(outputDir, filename)); err != nil {
			fmt.Printf("Warning: failed to collect %s: %v\n", filename, err)
		}
	}

	// Resource consumption metrics (memory, CPU, goroutines)
	resourceQueries := map[string]string{
		// Memory metrics (in bytes)
		"memory_rss_bytes_avg.json": fmt.Sprintf(`avg_over_time(process_resident_memory_bytes{job="%s"}[%s])`, job, timeRange),
		"memory_rss_bytes_max.json": fmt.Sprintf(`max_over_time(process_resident_memory_bytes{job="%s"}[%s])`, job, timeRange),
		"memory_rss_bytes_cur.json": fmt.Sprintf(`process_resident_memory_bytes{job="%s"}`, job),

		// Heap memory (Go runtime)
		"memory_heap_bytes_avg.json": fmt.Sprintf(`avg_over_time(go_memstats_heap_alloc_bytes{job="%s"}[%s])`, job, timeRange),
		"memory_heap_bytes_max.json": fmt.Sprintf(`max_over_time(go_memstats_heap_alloc_bytes{job="%s"}[%s])`, job, timeRange),

		// CPU metrics (rate of CPU seconds used)
		"cpu_usage_cores_avg.json": fmt.Sprintf(`rate(process_cpu_seconds_total{job="%s"}[%s])`, job, timeRange),
		"cpu_usage_cores_max.json": fmt.Sprintf(`max_over_time(rate(process_cpu_seconds_total{job="%s"}[1m])[%s:1m])`, job, timeRange),

		// Goroutines (concurrency indicator)
		"goroutines_avg.json": fmt.Sprintf(`avg_over_time(go_goroutines{job="%s"}[%s])`, job, timeRange),
		"goroutines_max.json": fmt.Sprintf(`max_over_time(go_goroutines{job="%s"}[%s])`, job, timeRange),
		"goroutines_cur.json": fmt.Sprintf(`go_goroutines{job="%s"}`, job),

		// GC metrics
		"gc_duration_seconds_p99.json": fmt.Sprintf(`histogram_quantile(0.99, sum(rate(go_gc_duration_seconds_bucket{job="%s"}[%s])) by (le))`, job, timeRange),

		// Threads
		"threads_cur.json": fmt.Sprintf(`go_threads{job="%s"}`, job),
	}

	for filename, query := range resourceQueries {
		if err := m.queryAndSave(ctx, query, filepath.Join(outputDir, filename)); err != nil {
			fmt.Printf("Warning: failed to collect %s: %v\n", filename, err)
		}
	}

	return nil
}

func (m *Manager) queryAndSave(ctx context.Context, query, outputPath string) error {
	result, err := m.Query(ctx, query)
	if err != nil {
		// Write empty result on error
		emptyResult := `{"status":"success","data":{"resultType":"vector","result":[]}}`
		return os.WriteFile(outputPath, []byte(emptyResult), 0644)
	}

	data, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return err
	}

	return os.WriteFile(outputPath, data, 0644)
}
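Taken together, queryAndSave assumes that Manager.Query issues a Prometheus instant query and returns a value that marshals into the standard /api/v1/query response shape, the same shape as the empty fallback it writes on error. A minimal sketch of such a query helper follows; it is illustrative only, since Manager's fields are not part of this hunk, and the base URL and names are hypothetical.

// Sketch only: a bare-bones Prometheus instant query against a base URL such as
// "http://localhost:9090" (URL and names are hypothetical, not the PR's actual client).
package prometheus

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// instantResult mirrors the /api/v1/query response body that queryAndSave
// persists to disk: status plus data.resultType and data.result.
type instantResult struct {
	Status string `json:"status"`
	Data   struct {
		ResultType string            `json:"resultType"`
		Result     []json.RawMessage `json:"result"`
	} `json:"data"`
}

func queryInstant(ctx context.Context, baseURL, promQL string) (*instantResult, error) {
	// Encode the PromQL expression as the "query" parameter of an instant query.
	endpoint := fmt.Sprintf("%s/api/v1/query?%s", baseURL, url.Values{"query": {promQL}}.Encode())

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("prometheus returned %s", resp.Status)
	}

	var out instantResult
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return nil, err
	}
	return &out, nil
}

Writing the empty vector result on query failure, as queryAndSave does above, keeps the downstream A/B comparison simple: every expected JSON file exists even when a metric was never scraped.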
2092
test/loadtest/internal/scenarios/scenarios.go
Normal file
File diff suppressed because it is too large
181
test/loadtest/manifests/prometheus.yaml
Normal file
@@ -0,0 +1,181 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 2s
      evaluation_interval: 2s

    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      - job_name: 'reloader-old'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - reloader-old
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      - job_name: 'reloader-new'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - reloader-new
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["get"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - name: prometheus
          image: quay.io/prometheus/prometheus:v2.47.0
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --web.console.libraries=/usr/share/prometheus/console_libraries
            - --web.console.templates=/usr/share/prometheus/consoles
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            limits:
              cpu: 1000m
              memory: 1Gi
            requests:
              cpu: 200m
              memory: 512Mi
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 10
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: data
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  selector:
    app: prometheus
  ports:
    - port: 9090
      targetPort: 9090
  type: NodePort
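The Deployment's readiness probe (/-/ready on port 9090) also gives the load-test harness a natural gate: collecting metrics before Prometheus is ready would only produce the empty fallback results. A minimal sketch of such a wait loop, assuming the Service above has been exposed locally (for example via kubectl port-forward -n monitoring svc/prometheus 9090:9090); the helper name and URL are hypothetical and not part of this change.

// Sketch only: poll the /-/ready endpoint declared in the readiness probe above
// until Prometheus can serve queries or the context expires.
package prometheus

import (
	"context"
	"net/http"
	"time"
)

func waitForPrometheusReady(ctx context.Context, baseURL string) error {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	for {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/-/ready", nil)
		if err != nil {
			return err
		}
		if resp, err := http.DefaultClient.Do(req); err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil // Prometheus is ready; scrape data can be queried
			}
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			// retry on the next tick
		}
	}
}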