diff --git a/src/Rcpp-adapters/FunctionsAdapter.cpp b/src/Rcpp-adapters/FunctionsAdapter.cpp
index 02d9f0af..5c980ed4 100644
--- a/src/Rcpp-adapters/FunctionsAdapter.cpp
+++ b/src/Rcpp-adapters/FunctionsAdapter.cpp
@@ -360,7 +360,11 @@ namespace exageostat::adapters {
         aConfigurations.SetDenseTileSize(aDenseTileSize);
         aConfigurations.SetLowTileSize(aLowTileSize);
         aConfigurations.SetDimension(validator::Validator::CheckDimensionValue(aDimension));
-        aConfigurations.SetProblemSize(data->GetLocations()->GetSize());
+        if (aConfigurations.GetIsFisher()) {
+            aConfigurations.SetProblemSize(train_data_size);
+        } else {
+            aConfigurations.SetProblemSize(train_data_size + test_data_size);
+        }
         aConfigurations.SetEstimatedTheta(aEstimatedTheta);
 
         // Temporarily release ownership to pass to the function.
diff --git a/tests/cpp-tests/Rcpp-adapters/CMakeLists.txt b/tests/cpp-tests/Rcpp-adapters/CMakeLists.txt
index 9a4a59a5..c80a2cf3 100644
--- a/tests/cpp-tests/Rcpp-adapters/CMakeLists.txt
+++ b/tests/cpp-tests/Rcpp-adapters/CMakeLists.txt
@@ -10,6 +10,7 @@
 
 set(EXAGEOSTAT_TESTFILES
         ${CMAKE_CURRENT_SOURCE_DIR}/TestAllRFunctions.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/TestRPredictProblemSize.cpp
         ${EXAGEOSTAT_TESTFILES}
         PARENT_SCOPE
         )
diff --git a/tests/cpp-tests/Rcpp-adapters/TestRPredictProblemSize.cpp b/tests/cpp-tests/Rcpp-adapters/TestRPredictProblemSize.cpp
new file mode 100644
index 00000000..d6d94a21
--- /dev/null
+++ b/tests/cpp-tests/Rcpp-adapters/TestRPredictProblemSize.cpp
@@ -0,0 +1,89 @@
+// Copyright (c) 2017-2024 King Abdullah University of Science and Technology,
+// All rights reserved.
+// ExaGeoStat is a software package, provided by King Abdullah University of Science and Technology (KAUST).
+
+/**
+ * @file TestRPredictProblemSize.cpp
+ * @brief Regression test for the problem-size configured by the R prediction adapter.
+ * @details PredictionSetupHelper (used by predict_data / R_ExaGeoStatPredictData) configured the problem
+ * size from the training locations only:
+ *
+ *     aConfigurations.SetProblemSize(data->GetLocations()->GetSize()); // == number of TRAINING points
+ *
+ * The prediction pipeline, however, works over the combined set of observed (training) and missing (test)
+ * locations, and sizes its internal Z descriptor from the configured problem size. When the number of test
+ * (missing) points is comparable to or larger than the number of training points, the problem size is too
+ * small and the prediction fails / produces incorrect values.
+ *
+ * The existing R-adapter test never triggers this because it uses 16 training points and only 2 test
+ * points (test << train). This test deliberately uses MORE test points than training points and checks the
+ * fundamental kriging interpolation property: predicting at an observed location must return the observed
+ * value (for an exact, nugget-free model). It uses the non-nugget univariate_matern_stationary kernel so it
+ * is independent of any other kernel-specific behaviour.
+ * @version 1.1.0
+**/
+
+#include <vector> // NOTE(review): include target was lost when this patch was flattened - confirm.
+
+#include <catch2/catch_all.hpp> // NOTE(review): include target was lost when this patch was flattened - confirm.
+
+#include <Rcpp-adapters/FunctionsAdapter.hpp> // NOTE(review): include target was lost when this patch was flattened - confirm.
+
+using namespace std;
+
+using namespace exageostat::adapters;
+using namespace exageostat::common;
+
+/**
+ * @brief Predicts at 12 test locations from 9 training points and requires the first 9 predictions
+ * (which coincide with the training locations) to match the observed values within 1e-4.
+ */
+void TEST_R_PREDICT_PROBLEM_SIZE() {
+
+    const string kernel = "univariate_matern_stationary";
+    const string distance_matrix = "eg";
+    const string dimension = "2D";
+    const int dts = 3;
+    const int lts = 0;
+    const vector<double> estimated_theta = {1.0, 0.1, 0.5};
+
+    // 9 training points with distinct measurements.
+    vector<double> train_x = {0.10, 0.22, 0.31, 0.43, 0.55, 0.62, 0.74, 0.81, 0.93};
+    vector<double> train_y = {0.12, 0.27, 0.39, 0.41, 0.58, 0.63, 0.71, 0.86, 0.95};
+    vector<double> train_z = {-1.0, 2.0, 0.5, -0.5, 1.5, -1.5, 0.8, -0.8, 1.1};
+    vector<vector<double>> train_data = {train_x, train_y, train_z};
+
+    auto hardware = ExaGeoStatHardware("exact", 1, 0);
+
+    SECTION("Prediction with more test points than training points") {
+
+        // 12 test points = the 9 training locations + 3 extra. N_test (12) > N_train (9), so a problem size
+        // computed from the training set alone is too small for the prediction descriptors.
+        vector<double> test_x = train_x;
+        vector<double> test_y = train_y;
+        test_x.push_back(0.18); test_y.push_back(0.20);
+        test_x.push_back(0.50); test_y.push_back(0.50);
+        test_x.push_back(0.88); test_y.push_back(0.90);
+        vector<vector<double>> test_data = {test_x, test_y};
+
+        REQUIRE(test_x.size() > train_x.size());
+
+        auto result = R_ExaGeoStatPredictData(kernel, distance_matrix, estimated_theta, dts, lts, dimension,
+                                              train_data, test_data);
+
+        // One prediction per test point.
+        REQUIRE(result.size() == test_x.size());
+
+        // Interpolation property: the prediction at each training location (the first 9 test points, which
+        // coincide with the training locations) must recover the observed value for an exact, nugget-free
+        // model. With the under-sized problem size the prediction is wrong; with the correct size it holds.
+        for (size_t i = 0; i < train_x.size(); i++) {
+            INFO("training location " << i << " at (" << train_x[i] << ", " << train_y[i] << ")");
+            REQUIRE(result[i] == Catch::Approx(train_z[i]).margin(1e-4));
+        }
+    }
+}
+
+TEST_CASE("R predict_data problem-size regression") {
+    TEST_R_PREDICT_PROBLEM_SIZE();
+}