apache · Nagato-Yuzuru · Jun 1, 2026 · Jun 1, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
@@ -58,13 +58,11 @@ use datafusion_common::{
 };
 use datafusion_expr::select_expr::SelectExpr;
 use datafusion_expr::{
-    ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
-    dml::InsertOp,
-    expr::{Alias, ScalarFunction},
-    is_null, lit,
-    utils::COUNT_STAR_EXPANSION,
+    ExplainOption, ScalarUDF, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
+    dml::InsertOp, is_null, lit, utils::COUNT_STAR_EXPANSION,
 };
 use datafusion_functions::core::coalesce;
+use datafusion_functions::math::nanvl;
 use datafusion_functions_aggregate::expr_fn::{
     avg, count, max, median, min, stddev, sum,
 };
@@ -2471,6 +2469,65 @@ impl DataFrame {
         &self,
         value: ScalarValue,
         columns: Vec<String>,
+    ) -> Result<DataFrame> {
+        self.fill_columns(value, &columns, coalesce(), |_| true)
+    }
+
+    // Resolve unqualified column names to fields of this DataFrame's schema.
+    fn find_columns(&self, names: &[impl AsRef<str>]) -> Result<Vec<FieldRef>> {
+        let schema = self.logical_plan().schema();
+        let mut fields = Vec::with_capacity(names.len());
+        for name in names {
+            let name = name.as_ref();
+            // Any lookup failure is reported as a planning error naming the column.
+            let field = schema
+                .field_with_name(None, name)
+                .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))?;
+            fields.push(Arc::clone(field));
+        }
+        Ok(fields)
+    }
+
+    /// Replace NaN values in floating-point columns with `value`.
+    ///
+    /// Non-floating-point columns are never modified, even when named explicitly.
+    /// A column is also left as-is when `value` cannot be cast to its data type.
+    ///
+    /// # Arguments
+    /// * `value` - Replacement for NaN; it is cast to each target column's type.
+    /// * `columns` - Columns to fill; empty means all, an unknown name is an error.
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use datafusion_common::ScalarValue;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// // Fill NaN in only columns "a" and "c":
+    /// let df = df.fill_nan(ScalarValue::from(0.0), &["a", "c"])?;
+    /// // Fill NaN across all columns:
+    /// let df = df.fill_nan(ScalarValue::from(0.0), &[])?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn fill_nan(&self, value: ScalarValue, columns: &[&str]) -> Result<DataFrame> {
+        // NaN only exists for floating-point types, so restrict the fill to those.
+        let is_float = |field: &FieldRef| field.data_type().is_floating();
+        self.fill_columns(value, columns, nanvl(), is_float)
+    }
+
+    #[expect(clippy::needless_pass_by_value)]
+    fn fill_columns(
+        &self,
+        value: ScalarValue,
+        columns: &[impl AsRef<str>],
+        func: Arc<ScalarUDF>,
+        applies: impl Fn(&FieldRef) -> bool,
     ) -> Result<DataFrame> {
         let cols = if columns.is_empty() {
             self.logical_plan()
@@ -2480,28 +2537,21 @@ impl DataFrame {
                 .map(Arc::clone)
                 .collect()
         } else {
-            self.find_columns(&columns)?
+            self.find_columns(columns)?
         };
 
-        // Create projections for each column
         let projections = self
             .logical_plan()
             .schema()
             .fields()
             .iter()
             .map(|field| {
-                if cols.contains(field) {
+                if cols.contains(field) && applies(field) {
                     // Try to cast fill value to column type. If the cast fails, fallback to the original column.
                     match value.clone().cast_to(field.data_type()) {
-                        Ok(fill_value) => Expr::Alias(Alias {
-                            expr: Box::new(Expr::ScalarFunction(ScalarFunction {
-                                func: coalesce(),
-                                args: vec![col(field.name()), lit(fill_value)],
-                            })),
-                            relation: None,
-                            name: field.name().to_string(),
-                            metadata: None,
-                        }),
+                        Ok(fill_value) => func
+                            .call(vec![col(field.name()), lit(fill_value)])
+                            .alias(field.name()),
                         Err(_) => col(field.name()),
                     }
                 } else {
@@ -2513,20 +2563,6 @@ impl DataFrame {
         self.clone().select(projections)
     }
 
-    // Helper to find columns from names
-    fn find_columns(&self, names: &[String]) -> Result<Vec<FieldRef>> {
-        let schema = self.logical_plan().schema();
-        names
-            .iter()
-            .map(|name| {
-                schema
-                    .field_with_name(None, name)
-                    .cloned()
-                    .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
-            })
-            .collect()
-    }
-
     /// Find qualified columns for this dataframe from names
     ///
     /// # Arguments

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
@@ -6539,6 +6539,173 @@ async fn test_fill_null_all_columns() -> Result<()> {
     Ok(())
 }
 
+async fn create_nan_table() -> Result<DataFrame> {
+    // Builds the in-memory table "t_nan": a Float64 column "a" that holds one
+    // NaN, and an Int32 column "b" that fill_nan is expected to leave as-is.
+    //    "+-----+---+",
+    //    "| a   | b |",
+    //    "+-----+---+",
+    //    "| 1.0 | 1 |",
+    //    "| NaN | 2 |",
+    //    "| 3.0 | 3 |",
+    //    "+-----+---+",
+    let fields = vec![
+        Field::new("a", DataType::Float64, true),
+        Field::new("b", DataType::Int32, true),
+    ];
+    let schema = Arc::new(Schema::new(fields));
+    let floats = Float64Array::from(vec![Some(1.0), Some(f64::NAN), Some(3.0)]);
+    let ints = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+    let batch = RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![Arc::new(floats), Arc::new(ints)],
+    )?;
+
+    let ctx = SessionContext::new();
+    let table = MemTable::try_new(schema, vec![vec![batch]])?;
+    ctx.register_table("t_nan", Arc::new(table))?;
+    Ok(ctx.table("t_nan").await?)
+}
+
+#[tokio::test]
+async fn test_fill_nan() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // Target only column "a": its single NaN must come back as 0.0.
+    let zero = ScalarValue::Float64(Some(0.0));
+    let batches = nan_df.fill_nan(zero, &["a"])?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_sort_string(&batches),
+        @r"
+    +-----+---+
+    | a   | b |
+    +-----+---+
+    | 0.0 | 2 |
+    | 1.0 | 1 |
+    | 3.0 | 3 |
+    +-----+---+
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_fill_nan_all_columns() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // An empty column list selects every column. The float column "a" gets
+    // its NaN replaced, while the Int32 column "b" passes through untouched
+    // because NaN is only defined for floating-point types.
+    let zero = ScalarValue::Float64(Some(0.0));
+    let batches = nan_df.fill_nan(zero, &[])?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_sort_string(&batches),
+        @r"
+    +-----+---+
+    | a   | b |
+    +-----+---+
+    | 0.0 | 2 |
+    | 1.0 | 1 |
+    | 3.0 | 3 |
+    +-----+---+
+    "
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_fill_nan_non_float_column() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // Asking to fill the Int32 column "b" succeeds but changes nothing: "b"
+    // cannot hold NaN, and "a" was not selected, so its NaN is still there.
+    let zero = ScalarValue::Float64(Some(0.0));
+    let batches = nan_df.fill_nan(zero, &["b"])?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_sort_string(&batches),
+        @r"
+    +-----+---+
+    | a   | b |
+    +-----+---+
+    | 1.0 | 1 |
+    | 3.0 | 3 |
+    | NaN | 2 |
+    +-----+---+
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_fill_nan_unknown_column() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // A name absent from the schema must surface as an error from fill_nan itself.
+    let zero = ScalarValue::Float64(Some(0.0));
+    let missing = ["does_not_exist"];
+    let plan_err = nan_df.fill_nan(zero, &missing).unwrap_err();
+
+    assert_snapshot!(plan_err.strip_backtrace(), @"Error during planning: Column 'does_not_exist' not found");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_fill_nan_casts_fill_value() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // The fill value is Int32(0) while column "a" is Float64. The value is
+    // castable, so the NaN still becomes 0.0. This covers the cross-type cast
+    // path that the Float64-valued tests never reach.
+    let int_zero = ScalarValue::Int32(Some(0));
+    let batches = nan_df.fill_nan(int_zero, &["a"])?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_sort_string(&batches),
+        @r"
+    +-----+---+
+    | a   | b |
+    +-----+---+
+    | 0.0 | 2 |
+    | 1.0 | 1 |
+    | 3.0 | 3 |
+    +-----+---+
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_fill_nan_uncastable_value() -> Result<()> {
+    let nan_df = create_nan_table().await?;
+
+    // Column "a" is selected, yet "abc" has no Float64 representation; the
+    // fill is therefore skipped and the original NaN remains in place.
+    let text = ScalarValue::Utf8(Some("abc".to_string()));
+    let batches = nan_df.fill_nan(text, &["a"])?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_sort_string(&batches),
+        @r"
+    +-----+---+
+    | a   | b |
+    +-----+---+
+    | 1.0 | 1 |
+    | 3.0 | 3 |
+    | NaN | 2 |
+    +-----+---+
+    "
+    );
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_insert_into_casting_support() -> Result<()> {
     // Testing case1:

diff --git a/parquet-testing b/parquet-testing
diff --git a/testing b/testing
+ −		data/arrow-ipc-file/clusterfuzz-testcase-arrow-ipc-file-fuzz-4778515122946048
+ −		data/arrow-ipc-file/clusterfuzz-testcase-arrow-ipc-file-fuzz-5390465250951168
+ −		data/arrow-ipc-file/clusterfuzz-testcase-arrow-ipc-file-fuzz-6051391008473088
+ −		data/arrow-ipc-file/clusterfuzz-testcase-minimized-arrow-ipc-file-fuzz-5301064831401984
+ −		data/arrow-ipc-file/clusterfuzz-testcase-minimized-arrow-ipc-file-fuzz-5527258982055936
+ −		data/arrow-ipc-file/clusterfuzz-testcase-minimized-arrow-ipc-file-fuzz-6088759971217408
+ −		data/arrow-ipc-file/clusterfuzz-testcase-minimized-arrow-ipc-file-fuzz-6126470891372544
+ −		data/arrow-ipc-file/clusterfuzz-testcase-minimized-arrow-ipc-file-fuzz-6295340960776192
+ −		data/arrow-ipc-stream/clusterfuzz-testcase-arrow-ipc-stream-fuzz-4561555323944960
+ −		data/arrow-ipc-stream/clusterfuzz-testcase-minimized-arrow-ipc-stream-fuzz-5091511766417408
+7 −0		data/parquet/encoding-fuzzing/README.md
+ −		data/parquet/encoding-fuzzing/clusterfuzz-testcase-minimized-parquet-encoding-fuzz-6365311967494144
+ −		data/parquet/encoding-fuzzing/clusterfuzz-testcase-minimized-parquet-encoding-fuzz-6493708102205440
+ −		data/parquet/encoding-fuzzing/clusterfuzz-testcase-parquet-encoding-fuzz-5530405595054080
+ −		data/parquet/encoding-fuzzing/clusterfuzz-testcase-parquet-encoding-fuzz-5921871337160704