Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Task Manager] Add config switch around logging at different levels based on the state #102804

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion docs/settings/task-manager-settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ Task Manager runs background tasks by polling for work on an interval. You can
| The maximum number of tasks that this Kibana instance will run simultaneously. Defaults to 10.
Starting in 8.0, it will not be possible to set the value greater than 100.

| `xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds`
| `xpack.task_manager.`
`monitored_stats_health_verbose_log.enabled`
| This flag will enable automatic warn and error logging if task manager self detects a performance issue, such as the time between when a task is scheduled to execute and when it actually executes. Defaults to false.

| `xpack.task_manager.`
`monitored_stats_health_verbose_log.`
`warn_delayed_task_start_in_seconds`
| The amount of seconds we allow a task to delay before printing a warning server log. Defaults to 60.
|===

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,8 @@ kibana_vars=(
xpack.task_manager.monitored_aggregated_stats_refresh_rate
xpack.task_manager.monitored_stats_required_freshness
xpack.task_manager.monitored_stats_running_average_window
xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_stats_health_verbose_log.enabled
xpack.task_manager.monitored_stats_health_verbose_log.warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_task_execution_thresholds
xpack.task_manager.poll_interval
xpack.task_manager.request_capacity
Expand Down
15 changes: 12 additions & 3 deletions x-pack/plugins/task_manager/server/config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -67,9 +70,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -103,9 +109,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {
"alerting:always-fires": Object {
Expand Down
9 changes: 6 additions & 3 deletions x-pack/plugins/task_manager/server/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,12 @@ export const configSchema = schema.object(
defaultValue: {},
}),
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
monitored_stats_warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ describe('managed configuration', () => {
version_conflict_threshold: 80,
max_poll_inactivity_cycles: 10,
monitored_aggregated_stats_refresh_rate: 60000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 4000,
monitored_stats_running_average_window: 50,
request_capacity: 1000,
Expand Down
191 changes: 181 additions & 10 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { configSchema, TaskManagerConfig } from '../config';
import { HealthStatus } from '../monitoring';
import { TaskPersistence } from '../monitoring/task_run_statistics';
import { MonitoredHealth } from '../routes/health';
import { logHealthMetrics } from './log_health_metrics';
import { logHealthMetrics, resetLastLogLevel } from './log_health_metrics';
import { Logger } from '../../../../../src/core/server';

jest.mock('./calculate_health_status', () => ({
Expand All @@ -20,12 +20,110 @@ jest.mock('./calculate_health_status', () => ({
describe('logHealthMetrics', () => {
afterEach(() => {
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// Reset the last state by running through this as OK
// (calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
resetLastLogLevel();
(calculateHealthStatus as jest.Mock<HealthStatus>).mockReset();
});

it('should log a warning message to enable verbose logging when the status goes from OK to Warning/Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from OK to Warning
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
// We must change from OK to Error
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
});

it('should not log a warning message to enable verbose logging when the status goes from Warning to OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from Warning to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});

it('should not log a warning message to enable verbose logging when the status goes from Error to OK', () => {
// console.log('start', getLastLogLevel());
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from Error to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});

it('should log as debug if status is OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();

logHealthMetrics(health, logger, config);

const firstDebug = JSON.parse(
(logger as jest.Mocked<Logger>).debug.mock.calls[0][0].replace('Latest Monitored Stats: ', '')
);
expect(firstDebug).toMatchObject(health);
});

it('should log as debug if status is OK even if not enabled', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();

Expand All @@ -40,7 +138,10 @@ describe('logHealthMetrics', () => {
it('should log as warn if status is Warn', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
Expand All @@ -62,7 +163,10 @@ describe('logHealthMetrics', () => {
it('should log as error if status is Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
Expand All @@ -79,15 +183,26 @@ describe('logHealthMetrics', () => {
expect(logMessage).toMatchObject(health);
});

it('should log as warn if drift exceeds the threshold', () => {
it('should log as warn if drift exceeds the threshold for a single alert type', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000 - 1,
},
},
drift: {
p99: 60000,
},
Expand All @@ -99,7 +214,50 @@ describe('logHealthMetrics', () => {
logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s (which exceeds configured value of 60s)`
`Detected delay task start of 60s for task(s) \"taskType:test\" (which exceeds configured value of 60s)`
);

const secondMessage = JSON.parse(
((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).replace(
`Latest Monitored Stats: `,
''
)
);
expect(secondMessage).toMatchObject(health);
});

it('should log as warn if drift exceeds the threshold for multiple alert types', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000,
},
},
drift: {
p99: 60000,
},
},
},
},
});

logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s for task(s) \"taskType:test, taskType:test2\" (which exceeds configured value of 60s)`
);

const secondMessage = JSON.parse(
Expand All @@ -114,7 +272,10 @@ describe('logHealthMetrics', () => {
it('should log as debug if there are no stats', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = {
id: '1',
Expand All @@ -135,7 +296,10 @@ describe('logHealthMetrics', () => {
it('should ignore capacity estimation status', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
Expand Down Expand Up @@ -213,7 +377,14 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth {
p95: 2500,
p99: 3000,
},
drift_by_type: {},
drift_by_type: {
'taskType:test': {
p50: 1000,
p90: 2000,
p95: 2500,
p99: 3000,
},
},
load: {
p50: 1000,
p90: 2000,
Expand Down
Loading