Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,21 @@ commitlog_disk_access_mode: legacy
# - direct: use direct I/O for compaction reads, bypassing the OS page cache
# compaction_read_disk_access_mode: auto

# Set the disk access mode for writing compressed SSTables during background operations
# (compaction, streaming, cleanup, repair, etc.). The allowed values are:
# - standard: use buffered I/O (default)
# - direct: use direct I/O, bypassing the OS page cache
# Only applies to compressed tables. Uncompressed tables always use buffered I/O.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't really need to prefix with Note:. It's not really information bearing.

# Memtable flushes always use buffered I/O regardless of this setting, as flushed
# data benefits from the page cache for subsequent reads.
# background_write_disk_access_mode: standard

# Size of the in-memory staging buffer for Direct IO background writes. Trades off syscall
# frequency against per-flush blocking latency on the compaction thread.
# Aligned up to filesystem block size; auto-expands to fit a single compressed chunk + CRC
# + one block when chunk_length exceeds this value.
# direct_write_buffer_size: 1MiB

# Compression to apply to SSTables as they flush for compressed tables.
# Note that tables without compression enabled do not respect this flag.
#
Expand Down
15 changes: 15 additions & 0 deletions conf/cassandra_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,21 @@ commitlog_disk_access_mode: auto
# - direct: use direct I/O for compaction reads, bypassing the OS page cache
# compaction_read_disk_access_mode: auto

# Set the disk access mode for writing compressed SSTables during background operations
# (compaction, streaming, cleanup, repair, etc.). The allowed values are:
# - standard: use buffered I/O (default)
# - direct: use direct I/O, bypassing the OS page cache
# Only applies to compressed tables. Uncompressed tables always use buffered I/O.
# Memtable flushes always use buffered I/O regardless of this setting, as flushed
# data benefits from the page cache for subsequent reads.
background_write_disk_access_mode: direct

# Size of the in-memory staging buffer for Direct IO background writes. Trades off syscall
# frequency against per-flush blocking latency on the compaction thread.
# Aligned up to filesystem block size; auto-expands to fit a single compressed chunk + CRC
# + one block when chunk_length exceeds this value.
# direct_write_buffer_size: 1MiB

# Compression to apply to SSTables as they flush for compressed tables.
# Note that tables without compression enabled do not respect this flag.
#
Expand Down
15 changes: 13 additions & 2 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,18 @@ public MemtableOptions()

public DataStorageSpec.IntKibibytesBound compressed_read_ahead_buffer_size = new DataStorageSpec.IntKibibytesBound("256KiB");

// Disk access mode for background SSTable writes (compaction, streaming, cleanup, etc.).
// When 'direct' is set, background writes bypass the OS page cache using O_DIRECT.
// Memtable flushes always use buffered I/O regardless of this setting.
// Defaults to 'standard' (buffered I/O); users must explicitly opt in to Direct I/O.
// 'auto' is also accepted and is currently resolved to 'standard' by DatabaseDescriptor.
public DiskAccessMode background_write_disk_access_mode = DiskAccessMode.standard;

// Size of the in-memory staging buffer for Direct I/O background writes. Trades off syscall
// frequency against per-flush blocking latency on the compaction thread.
// Aligned up to the filesystem block size; auto-expands to fit a single compressed chunk + CRC
// + one block when chunk_length exceeds this value.
// Must be greater than zero when background_write_disk_access_mode is 'direct'.
public DataStorageSpec.IntKibibytesBound direct_write_buffer_size = new DataStorageSpec.IntKibibytesBound("1MiB");

// fraction of free disk space available for compaction after min free space is subtracted
public volatile Double max_space_usable_for_compactions_in_percentage = .95;

Expand Down Expand Up @@ -1275,8 +1287,7 @@ public enum DiskAccessMode
legacy,

/**
* Direct-I/O is enabled for commitlog disk only.
* When adding support for direct IO, update {@link org.apache.cassandra.service.StartupChecks#checkKernelBug1057843}
* When adding support for Direct I/O, update {@link org.apache.cassandra.service.StartupChecks#checkKernelBug1057843}
*/
direct
}
Expand Down
84 changes: 84 additions & 0 deletions src/java/org/apache/cassandra/config/DatabaseDescriptor.java
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ public class DatabaseDescriptor

private static DiskAccessMode compactionReadDiskAccessMode;

private static DiskAccessMode backgroundWriteDiskAccessMode;

private static AbstractCryptoProvider cryptoProvider;
private static IAuthenticator authenticator;
private static IAuthorizer authorizer;
Expand Down Expand Up @@ -897,6 +899,10 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
if (conf.hints_directory.equals(conf.saved_caches_directory))
throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);

initializeBackgroundWriteDiskAccessMode();
if (backgroundWriteDiskAccessMode != conf.background_write_disk_access_mode)
logger.info("background_write_disk_access_mode resolved to: {}", backgroundWriteDiskAccessMode);

if (conf.memtable_flush_writers == 0)
{
conf.memtable_flush_writers = conf.data_file_directories.length == 1 ? 2 : 1;
Expand Down Expand Up @@ -3406,6 +3412,84 @@ public static void initializeCommitLogDiskAccessMode()
commitLogWriteDiskAccessMode = accessModeDirectIoPair.left;
}

/**
 * Returns the disk access mode currently in effect for background SSTable writes.
 *
 * @return the value last stored by {@link #initializeBackgroundWriteDiskAccessMode()} or
 *         {@link #setBackgroundWriteDiskAccessMode(DiskAccessMode)}
 */
public static DiskAccessMode getBackgroundWriteDiskAccessMode()
{
    final DiskAccessMode effectiveMode = backgroundWriteDiskAccessMode;
    return effectiveMode;
}

/**
 * Test hook that overrides the background write disk access mode. Both the resolved value
 * and the loaded configuration are updated; no validation is performed here.
 *
 * @param diskAccessMode the mode to store
 */
@VisibleForTesting
public static void setBackgroundWriteDiskAccessMode(DiskAccessMode diskAccessMode)
{
backgroundWriteDiskAccessMode = diskAccessMode;
conf.background_write_disk_access_mode = diskAccessMode;
}

/**
 * Returns the configured {@code direct_write_buffer_size}, i.e. the size of the staging
 * buffer used for Direct I/O background writes.
 *
 * @return the value held by the loaded configuration
 */
public static DataStorageSpec.IntKibibytesBound getDirectWriteBufferSize()
{
    final DataStorageSpec.IntKibibytesBound configuredSize = conf.direct_write_buffer_size;
    return configuredSize;
}

/**
 * Validates {@code background_write_disk_access_mode} from the loaded configuration and stores
 * the resolved mode in {@code backgroundWriteDiskAccessMode}.
 * <ul>
 * <li>{@code auto} is resolved to {@code standard}.</li>
 * <li>{@code direct} requires {@code direct_write_buffer_size} to be greater than zero. Unless
 * {@code toolInitialized} is set, each configured data directory is also passed to
 * {@code PathUtils.createDirectoriesIfNotExists} and probed for Direct I/O support.</li>
 * <li>Any resolved mode other than {@code standard} or {@code direct} is rejected.</li>
 * </ul>
 *
 * @throws ConfigurationException if the buffer size is not positive in direct mode, if any data
 *         directory does not support Direct I/O (or the probe itself fails), or if the
 *         configured mode is not one of standard/direct/auto
 */
@VisibleForTesting
public static void initializeBackgroundWriteDiskAccessMode()
{
DiskAccessMode providedMode = conf.background_write_disk_access_mode;

if (providedMode == DiskAccessMode.auto)
{
providedMode = DiskAccessMode.standard;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Auto seems pretty unconditional here so the check above to log that it changed doesn't seem to mean much? I guess it clarifies that AUTO always resolves to STANDARD. Not sure when it would change in the future.

It's fine just something I noted.

}

if (providedMode == DiskAccessMode.direct)
{
// DataStorageSpec already rejects negatives at parse time; zero is the remaining
// nonsense value. The writer's Math.max would silently coerce it to minRequiredSize,
// which masks a likely operator mistake — fail fast instead.
if (conf.direct_write_buffer_size.toBytes() <= 0)
throw new ConfigurationException("direct_write_buffer_size must be > 0 when background_write_disk_access_mode is 'direct'. " +
"Got: " + conf.direct_write_buffer_size, false);

// The per-directory filesystem probe below is skipped when toolInitialized is set.
if (!toolInitialized)
{
// Collect every failing directory so a single exception can report all of them.
List<String> unsupportedLocations = new ArrayList<>();

for (String dataDir : conf.data_file_directories)
{
try
{
File dataDirFile = new File(dataDir);
PathUtils.createDirectoriesIfNotExists(dataDirFile.toPath());

if (!FileUtils.isDirectIOSupported(dataDirFile))
{
unsupportedLocations.add(dataDir);
}
}
catch (RuntimeException e)
{
// A failed probe is recorded like an unsupported directory, so it also
// triggers the ConfigurationException below.
logger.warn("Unable to determine Direct IO support for data directory {}: {}", dataDir, e.getMessage());
unsupportedLocations.add(dataDir + " (check failed: " + e.getMessage() + ")");
}
}

if (!unsupportedLocations.isEmpty())
{
throw new ConfigurationException(
String.format("background_write_disk_access_mode is set to 'direct', but the following data directories " +
"do not support Direct I/O: %s. Either change background_write_disk_access_mode to 'standard' " +
"in cassandra.yaml, or ensure all data directories are on filesystems that support Direct I/O.",
unsupportedLocations), false);
}
}
}
else if (providedMode != DiskAccessMode.standard)
{
throw new ConfigurationException("Unsupported disk access mode for background_write_disk_access_mode " +
"(options: standard/direct/auto): " + providedMode, false);
}

// Only reached when validation passed; publish the resolved mode.
backgroundWriteDiskAccessMode = providedMode;
}

public static String getSavedCachesLocation()
{
return conf.saved_caches_directory;
Expand Down
49 changes: 49 additions & 0 deletions src/java/org/apache/cassandra/io/DirectIoSupport.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io;

/**
 * Describes whether an operation may use a direct-IO (O_DIRECT) data path and, when it may
 * not, which class of reason excludes it. Each consumer keeps its own per-operation
 * classification and combines it with its own gates (e.g. compression, configuration mode),
 * so {@link #SUPPORTED} is a necessary condition for taking the path, not a sufficient one.
 */
public enum DirectIoSupport
{
    /**
     * The operation is eligible for the direct-IO data path.
     */
    SUPPORTED,

    /**
     * The operation is mechanically incompatible with the direct-IO path. Lifting this
     * exclusion takes code changes, not a policy decision.
     */
    UNSUPPORTED_CORRECTNESS,

    /**
     * Direct IO would function but has been deliberately turned off for performance or
     * cache-residency reasons. Lifting this exclusion takes a policy re-evaluation, not
     * code changes.
     */
    UNSUPPORTED_POLICY;

    /**
     * Tells whether this classification permits the direct-IO data path.
     *
     * @return {@code true} for {@link #SUPPORTED}, {@code false} for every other constant
     */
    public boolean isSupported()
    {
        switch (this)
        {
            case SUPPORTED:
                return true;
            default:
                return false;
        }
    }
}
Loading