forked from gchq/sleeper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtable.properties
More file actions
208 lines (156 loc) · 10.8 KB
/
table.properties
File metadata and controls
208 lines (156 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
## The following table properties relate to the definition of data inside a table.
# A unique name identifying this table.
sleeper.table.name=example-table
# Fully qualified class of a custom iterator to use when iterating over the values in this table.
# Defaults to nothing.
sleeper.table.iterator.class.name=sleeper.core.iterator.impl.AgeOffIterator
# Iterator configuration. An iterator will be initialised with the following configuration.
sleeper.table.iterator.config=b,3600000
## The following table properties relate to partition splitting.
# Splits file which will be used to initialise the partitions for this table. Defaults to nothing and
# the table will be created with a single root partition.
sleeper.table.splits.file=example/full/splits.txt
# Flag to set if you have base64 encoded the split points (only used for string key types and defaults
# to false).
sleeper.table.splits.base64.encoded=false
# Partitions in this table with more than the following number of records in will be split.
sleeper.table.partition.splitting.threshold=1000000000
## The following table properties relate to the storage of data inside a table.
# The size of the row group in the Parquet files - defaults to the value in the instance properties.
sleeper.table.rowgroup.size=8388608
# The size of the page in the Parquet files - defaults to the value in the instance properties.
sleeper.table.page.size=131072
# Whether dictionary encoding should be used for row key columns in the Parquet files.
sleeper.table.parquet.dictionary.encoding.rowkey.fields=false
# Whether dictionary encoding should be used for sort key columns in the Parquet files.
sleeper.table.parquet.dictionary.encoding.sortkey.fields=false
# Whether dictionary encoding should be used for value columns in the Parquet files.
sleeper.table.parquet.dictionary.encoding.value.fields=false
# Used to set parquet.columnindex.truncate.length, see documentation here:
# https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
# The length in bytes to truncate binary values in a column index.
sleeper.table.parquet.columnindex.truncate.length=128
# Used to set parquet.statistics.truncate.length, see documentation here:
# https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
# The length in bytes to truncate the min/max binary values in row groups.
sleeper.table.parquet.statistics.truncate.length=2147483647
# The S3 readahead range - defaults to the value in the instance properties.
sleeper.table.fs.s3a.readahead.range=64K
# The compression codec to use for this table. Defaults to the value in the instance properties.
# Valid values are: [uncompressed, snappy, gzip, lzo, brotli, lz4, zstd]
sleeper.table.compression.codec=zstd
# A file will not be deleted until this number of minutes have passed after it has been marked as
# ready for garbage collection. The reason for not deleting files immediately after they have been
# marked as ready for garbage collection is that they may still be in use by queries. Defaults to the
# value set in the instance properties.
sleeper.table.gc.delay.minutes=15
## The following table properties relate to compactions.
# The name of the class that defines how compaction jobs should be created.
# This should implement sleeper.compaction.strategy.CompactionStrategy. Defaults to the strategy used
# by the whole instance (set in the instance properties).
sleeper.table.compaction.strategy.class=sleeper.compaction.strategy.impl.SizeRatioCompactionStrategy
# The number of files to read in a compaction job. Note that the state store must support atomic
# updates for this many files.
# The DynamoDBStateStore must be able to atomically apply 2 updates to create the output files for a
# splitting compaction, and 2 updates for each input file to mark them as ready for garbage
# collection. There's a limit of 100 atomic updates, which equates to 48 files in a compaction.
# See also: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/transaction-apis.html
# (NB This does not apply to splitting jobs which will run even if there is only 1 file.)
sleeper.table.compaction.files.batch.size=11
# Used by the SizeRatioCompactionStrategy to decide if a group of files should be compacted.
# If the file sizes are s_1, ..., s_n then the files are compacted if s_1 + ... + s_{n-1} >= ratio *
# s_n.
sleeper.table.compaction.strategy.sizeratio.ratio=3
# Used by the SizeRatioCompactionStrategy to control the maximum number of jobs that can be running
# concurrently per partition.
sleeper.table.compaction.strategy.sizeratio.max.concurrent.jobs.per.partition=2147483647
## The following table properties relate to storing and retrieving metadata for tables.
# The name of the class used for the metadata store. The default is S3StateStore. An alternative
# option is the DynamoDBStateStore.
sleeper.table.statestore.classname=sleeper.statestore.s3.S3StateStore
# This specifies whether queries and scans against DynamoDB tables used in the DynamoDB state store
# are strongly consistent.
sleeper.table.metadata.dynamo.consistent.reads=false
## The following table properties relate to bulk import, i.e. ingesting data using Spark jobs running
## on EMR or EKS.
# (Non-persistent EMR mode only) Which architecture is to be used for EC2 instance types in the EMR
# cluster. Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk
# import using EMR - Instance types section in docs/05-ingest.md
sleeper.table.bulk.import.emr.instance.architecture=x86_64
# (Non-persistent EMR mode only) The EC2 x86_64 instance types and weights to be used for the master
# node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.table.bulk.import.emr.master.x86.instance.types=m6i.xlarge
# (Non-persistent EMR mode only) The EC2 x86_64 instance types and weights to be used for the executor
# nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.table.bulk.import.emr.executor.x86.instance.types=m6i.4xlarge
# (Non-persistent EMR mode only) The EC2 ARM64 instance types and weights to be used for the master
# node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.table.bulk.import.emr.master.arm.instance.types=m6g.xlarge
# (Non-persistent EMR mode only) The EC2 ARM64 instance types and weights to be used for the executor
# nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.table.bulk.import.emr.executor.arm.instance.types=m6g.4xlarge
# (Non-persistent EMR mode only) The purchasing option to be used for the executor nodes of the EMR
# cluster.
# Valid values are ON_DEMAND or SPOT.
sleeper.table.bulk.import.emr.executor.market.type=SPOT
# (Non-persistent EMR mode only) The initial number of capacity units to provision as EC2 instances
# for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This value overrides the default value in the instance properties. It can be overridden by a value
# in the bulk import job specification.
sleeper.table.bulk.import.emr.executor.initial.capacity=2
# (Non-persistent EMR mode only) The maximum number of capacity units to provision as EC2 instances
# for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This value overrides the default value in the instance properties. It can be overridden by a value
# in the bulk import job specification.
sleeper.table.bulk.import.emr.executor.max.capacity=10
# (Non-persistent EMR mode only) The EMR release label to be used when creating an EMR cluster for
# bulk importing data using Spark running on EMR.
# This value overrides the default value in the instance properties. It can be overridden by a value
# in the bulk import job specification.
sleeper.table.bulk.import.emr.release.label=emr-6.13.0
# Specifies the minimum number of leaf partitions that are needed to run a bulk import job. If this
# minimum has not been reached, bulk import jobs will refuse to start.
sleeper.table.bulk.import.min.leaf.partitions=64
## The following table properties relate to the ingest batcher.
# Specifies the minimum total file size required for an ingest job to be batched and sent. An ingest
# job will be created if the batcher runs while this much data is waiting, and the minimum number of
# files is also met.
sleeper.table.ingest.batcher.job.min.size=1G
# Specifies the maximum total file size for a job in the ingest batcher. If more data is waiting than
# this, it will be split into multiple jobs. If a single file exceeds this, it will still be ingested
# in its own job. It's also possible some data may be left for a future run of the batcher if some
# recent files overflow the size of a job but aren't enough to create a job on their own.
sleeper.table.ingest.batcher.job.max.size=5G
# Specifies the minimum number of files for a job in the ingest batcher. An ingest job will be created
# if the batcher runs while this many files are waiting, and the minimum size of files is also met.
sleeper.table.ingest.batcher.job.min.files=1
# Specifies the maximum number of files for a job in the ingest batcher. If more files are waiting
# than this, they will be split into multiple jobs. It's possible some data may be left for a future
# run of the batcher if some recent files overflow the size of a job but aren't enough to create a job
# on their own.
sleeper.table.ingest.batcher.job.max.files=100
# Specifies the maximum time in seconds that a file can be held in the batcher before it will be
# included in an ingest job. When any file has been waiting for longer than this, a job will be
# created with all the currently held files, even if other criteria for a batch are not met.
sleeper.table.ingest.batcher.file.max.age.seconds=300
# Specifies the target ingest queue where batched jobs are sent.
# Valid values are: [standard_ingest, bulk_import_emr, bulk_import_persistent_emr, bulk_import_eks,
# bulk_import_emr_serverless]
sleeper.table.ingest.batcher.ingest.mode=standard_ingest
# The time in minutes that the tracking information is retained for a file before the records of its
# ingest are deleted (e.g. which ingest job it was assigned to, the time this occurred, the size of
# the file).
# The expiry time is fixed when a file is saved to the store, so changing this will only affect new
# data.
# Defaults to 1 week.
sleeper.table.ingest.batcher.file.tracking.ttl.minutes=10080