|
1 | | -from apscheduler.schedulers.asyncio import AsyncIOScheduler |
2 | | -from apscheduler.triggers.interval import IntervalTrigger |
3 | | - |
4 | | -from dstack._internal.server import settings |
5 | | -from dstack._internal.server.background.tasks.process_compute_groups import process_compute_groups |
6 | | -from dstack._internal.server.background.tasks.process_events import delete_events |
7 | | -from dstack._internal.server.background.tasks.process_fleets import process_fleets |
8 | | -from dstack._internal.server.background.tasks.process_gateways import ( |
9 | | - process_gateways, |
10 | | - process_gateways_connections, |
11 | | -) |
12 | | -from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes |
13 | | -from dstack._internal.server.background.tasks.process_instances import ( |
14 | | - delete_instance_health_checks, |
15 | | - process_instances, |
16 | | -) |
17 | | -from dstack._internal.server.background.tasks.process_metrics import ( |
18 | | - collect_metrics, |
19 | | - delete_metrics, |
20 | | -) |
21 | | -from dstack._internal.server.background.tasks.process_placement_groups import ( |
22 | | - process_placement_groups, |
23 | | -) |
24 | | -from dstack._internal.server.background.tasks.process_probes import process_probes |
25 | | -from dstack._internal.server.background.tasks.process_prometheus_metrics import ( |
26 | | - collect_prometheus_metrics, |
27 | | - delete_prometheus_metrics, |
28 | | -) |
29 | | -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs |
30 | | -from dstack._internal.server.background.tasks.process_runs import process_runs |
31 | | -from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs |
32 | | -from dstack._internal.server.background.tasks.process_terminating_jobs import ( |
33 | | - process_terminating_jobs, |
34 | | -) |
35 | | -from dstack._internal.server.background.tasks.process_volumes import process_submitted_volumes |
36 | | - |
# Single process-wide scheduler instance, shared by get_scheduler() and
# started by start_background_tasks().
_scheduler = AsyncIOScheduler()
38 | | - |
39 | | - |
def get_scheduler() -> AsyncIOScheduler:
    """Return the process-wide background-task scheduler instance."""
    return _scheduler
42 | | - |
43 | | - |
def start_background_tasks() -> AsyncIOScheduler:
    """Register all background processing jobs and start the scheduler.

    Background processing is implemented via in-memory locks on SQLite
    and SELECT FOR UPDATE on Postgres. Locks may be held for a long time.
    This is currently the main bottleneck for scaling dstack processing
    as processing more resources requires more DB connections.
    TODO: Make background processing efficient by committing locks to DB
    and processing outside of DB transactions.

    Now we just try to process as many resources as possible without
    exhausting DB connections.

    Quick tasks can process multiple resources per transaction.
    Potentially long tasks process one resource per transaction
    to avoid holding locks for all the resources if one is slow to process.
    Still, the next batch won't be processed unless all resources are
    processed, so larger batches do not increase processing rate linearly.

    The interval, batch_size, and max_instances determine background tasks
    processing rates. By default, one server replica can handle:

    * 150 active jobs with 2 minutes processing latency
    * 150 active runs with 2 minutes processing latency
    * 150 active instances with 2 minutes processing latency

    These latency numbers do not account for provisioning time,
    so it may be slower if a backend is slow to provision.

    Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more
    resources per replica. They also need to increase max db connections
    on the client side and db side.

    In-memory locking via locksets does not guarantee
    that the first waiting for the lock will acquire it.
    The jitter is needed to give all tasks a chance to acquire locks.

    Returns:
        The started module-level AsyncIOScheduler.
    """
    _add_singleton_jobs()
    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
        # Add multiple copies of tasks if requested.
        # max_instances=1 for additional copies to avoid running too many tasks.
        # Move other tasks here when they need per-replica scaling.
        _add_scalable_jobs(first_copy=replica == 0)
    _scheduler.start()
    return _scheduler


def _add_singleton_jobs() -> None:
    """Register jobs that always run a fixed number of copies, regardless of
    SERVER_BACKGROUND_PROCESSING_FACTOR."""
    _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
    _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
    _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
    _scheduler.add_job(delete_events, IntervalTrigger(minutes=7), max_instances=1)
    if settings.ENABLE_PROMETHEUS_METRICS:
        _scheduler.add_job(
            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
        )
        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
    _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5)
    _scheduler.add_job(
        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
    )
    _scheduler.add_job(
        process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
    )
    _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
    _scheduler.add_job(
        process_fleets,
        IntervalTrigger(seconds=10, jitter=2),
        max_instances=1,
    )
    _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)


def _add_scalable_jobs(first_copy: bool) -> None:
    """Register one copy of each per-replica scalable job.

    Args:
        first_copy: True for the first registered copy, which gets the full
            max_instances budget; additional copies are capped at 1 to avoid
            running too many concurrent tasks.
    """
    _scheduler.add_job(
        process_submitted_jobs,
        IntervalTrigger(seconds=4, jitter=2),
        kwargs={"batch_size": 5},
        max_instances=4 if first_copy else 1,
    )
    _scheduler.add_job(
        process_running_jobs,
        IntervalTrigger(seconds=4, jitter=2),
        kwargs={"batch_size": 5},
        max_instances=2 if first_copy else 1,
    )
    _scheduler.add_job(
        process_terminating_jobs,
        IntervalTrigger(seconds=4, jitter=2),
        kwargs={"batch_size": 5},
        max_instances=2 if first_copy else 1,
    )
    _scheduler.add_job(
        process_runs,
        IntervalTrigger(seconds=2, jitter=1),
        kwargs={"batch_size": 5},
        max_instances=2 if first_copy else 1,
    )
    _scheduler.add_job(
        process_instances,
        IntervalTrigger(seconds=4, jitter=2),
        kwargs={"batch_size": 5},
        max_instances=2 if first_copy else 1,
    )
    _scheduler.add_job(
        process_compute_groups,
        IntervalTrigger(seconds=15, jitter=2),
        kwargs={"batch_size": 1},
        max_instances=2 if first_copy else 1,
    )
0 commit comments