Skip to content

Tabsyn Sampling Fails When Dataset Contains Only Categorical Features #29

@kirilzilla

Description

@kirilzilla

Description:

I encountered an error during the sampling phase using the tabsyn method. The issue seems to be related to missing numerical features during preprocessing.

Steps to Reproduce:

  1. Use the following insurance.json configuration:
    It has 25 categorical columns and no numerical columns; the first column is the target column (0 or 1).

    {
        "name": "insurance",
        "task_type": "binclass",
        "header": "infer",
        "column_names": [
            "GoodStudent", "Age", "SocioEcon", "RiskAversion", "VehicleYear", 
            "RuggedAuto", "MakeModel", "DrivQuality", "Mileage", "Antilock", 
            "DrivingSkill", "SeniorTrain", "ThisCarCost", "Theft", "CarValue", 
            "HomeBase", "AntiTheft", "PropCost", "OtherCarCost", "OtherCar", 
            "MedCost", "Cushioning", "Airbag", "ILiCost", "DrivHist"
        ],
        "num_col_idx": [],
        "cat_col_idx": [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
            11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
            21, 22, 23, 24
        ],
        "target_col_idx": [0],
        "file_type": "csv",
        "data_path": "data/insurance/insurance.csv",
        "test_path": null
    }
    
  2. Preprocess and train the model successfully using VAE and tabsyn.

Preprocessing output shows a numerical vector, although there are no numerical features.
Numerical (18000, 0)
Categorical (18000, 24)

  • I think this might be a mistake. It should be NO numerical columns and 25 categorical columns instead of 24.
  3. Run the sampling command:
    python main.py --dataname insurance --method tabsyn --mode sample

During the sampling step, the following error is thrown:

ValueError: Found array with 0 feature(s) (shape=(18000, 0)) while a minimum of 1 is required by QuantileTransformer.

This is the processed JSON in tabsyn/data/insurance/info.json

{
 "name": "insurance",
 "task_type": "binclass",
 "header": "infer",
 "column_names": [
     "GoodStudent",
     "Age",
     "SocioEcon",
     "RiskAversion",
     "VehicleYear",
     "RuggedAuto",
     "MakeModel",
     "DrivQuality",
     "Mileage",
     "Antilock",
     "DrivingSkill",
     "SeniorTrain",
     "ThisCarCost",
     "Theft",
     "CarValue",
     "HomeBase",
     "AntiTheft",
     "PropCost",
     "OtherCarCost",
     "OtherCar",
     "MedCost",
     "Cushioning",
     "Airbag",
     "ILiCost",
     "DrivHist"
 ],
 "num_col_idx": [],
 "cat_col_idx": [
     1,
     2,
     3,
     4,
     5,
     6,
     7,
     8,
     9,
     10,
     11,
     12,
     13,
     14,
     15,
     16,
     17,
     18,
     19,
     20,
     21,
     22,
     23,
     24
 ],
 "target_col_idx": [
     0
 ],
 "file_type": "csv",
 "data_path": "data/insurance/insurance.csv",
 "test_path": null,
 "column_info": {
     "1": {},
     "type": "categorical",
     "categorizes": [
         0,
         1
     ],
     "2": {},
     "3": {},
     "4": {},
     "5": {},
     "6": {},
     "7": {},
     "8": {},
     "9": {},
     "10": {},
     "11": {},
     "12": {},
     "13": {},
     "14": {},
     "15": {},
     "16": {},
     "17": {},
     "18": {},
     "19": {},
     "20": {},
     "21": {},
     "22": {},
     "23": {},
     "24": {},
     "0": {}
 },
 "train_num": 18000,
 "test_num": 2000,
 "idx_mapping": {
     "0": 24,
     "1": 0,
     "2": 1,
     "3": 2,
     "4": 3,
     "5": 4,
     "6": 5,
     "7": 6,
     "8": 7,
     "9": 8,
     "10": 9,
     "11": 10,
     "12": 11,
     "13": 12,
     "14": 13,
     "15": 14,
     "16": 15,
     "17": 16,
     "18": 17,
     "19": 18,
     "20": 19,
     "21": 20,
     "22": 21,
     "23": 22,
     "24": 23
 },
 "inverse_idx_mapping": {
     "24": 0,
     "0": 1,
     "1": 2,
     "2": 3,
     "3": 4,
     "4": 5,
     "5": 6,
     "6": 7,
     "7": 8,
     "8": 9,
     "9": 10,
     "10": 11,
     "11": 12,
     "12": 13,
     "13": 14,
     "14": 15,
     "15": 16,
     "16": 17,
     "17": 18,
     "18": 19,
     "19": 20,
     "20": 21,
     "21": 22,
     "22": 23,
     "23": 24
 },
 "idx_name_mapping": {
     "0": "GoodStudent",
     "1": "Age",
     "2": "SocioEcon",
     "3": "RiskAversion",
     "4": "VehicleYear",
     "5": "RuggedAuto",
     "6": "MakeModel",
     "7": "DrivQuality",
     "8": "Mileage",
     "9": "Antilock",
     "10": "DrivingSkill",
     "11": "SeniorTrain",
     "12": "ThisCarCost",
     "13": "Theft",
     "14": "CarValue",
     "15": "HomeBase",
     "16": "AntiTheft",
     "17": "PropCost",
     "18": "OtherCarCost",
     "19": "OtherCar",
     "20": "MedCost",
     "21": "Cushioning",
     "22": "Airbag",
     "23": "ILiCost",
     "24": "DrivHist"
 },
 "metadata": {
     "columns": {
         "1": {
             "sdtype": "categorical"
         },
         "2": {
             "sdtype": "categorical"
         },
         "3": {
             "sdtype": "categorical"
         },
         "4": {
             "sdtype": "categorical"
         },
         "5": {
             "sdtype": "categorical"
         },
         "6": {
             "sdtype": "categorical"
         },
         "7": {
             "sdtype": "categorical"
         },
         "8": {
             "sdtype": "categorical"
         },
         "9": {
             "sdtype": "categorical"
         },
         "10": {
             "sdtype": "categorical"
         },
         "11": {
             "sdtype": "categorical"
         },
         "12": {
             "sdtype": "categorical"
         },
         "13": {
             "sdtype": "categorical"
         },
         "14": {
             "sdtype": "categorical"
         },
         "15": {
             "sdtype": "categorical"
         },
         "16": {
             "sdtype": "categorical"
         },
         "17": {
             "sdtype": "categorical"
         },
         "18": {
             "sdtype": "categorical"
         },
         "19": {
             "sdtype": "categorical"
         },
         "20": {
             "sdtype": "categorical"
         },
         "21": {
             "sdtype": "categorical"
         },
         "22": {
             "sdtype": "categorical"
         },
         "23": {
             "sdtype": "categorical"
         },
         "24": {
             "sdtype": "categorical"
         },
         "0": {
             "sdtype": "categorical"
         }
     }
 }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions