use std::sync::Arc;
use datafusion::arrow::array::{Float64Array, TimestampMillisecondArray};
use datafusion::arrow::datatypes::TimeUnit;
use datafusion::common::DataFusionError;
use datafusion::logical_expr::{ScalarUDF, Volatility};
use datafusion::physical_plan::ColumnarValue;
use datafusion_expr::create_udf;
use datatypes::arrow::array::Array;
use datatypes::arrow::datatypes::DataType;
use crate::error;
use crate::functions::{extract_array, linear_regression};
use crate::range_array::RangeArray;
/// State for the `prom_predict_linear` range function.
///
/// One instance is created per UDF invocation by [`PredictLinear::scalar_udf`] and carries
/// only the prediction offset.
#[derive(Debug, Clone)]
pub struct PredictLinear {
    /// Prediction offset; it is multiplied by the regression slope and added to the intercept.
    /// NOTE(review): presumably seconds, as in PromQL's `predict_linear` — confirm against the
    /// slope unit produced by `linear_regression`.
    t: i64,
}
impl PredictLinear {
    /// Creates an evaluator for the prediction offset `t`.
    fn new(t: i64) -> Self {
        Self { t }
    }

    /// Name under which the function is registered as a UDF.
    pub const fn name() -> &'static str {
        "prom_predict_linear"
    }

    /// Builds the DataFusion scalar UDF for a fixed prediction offset `t`.
    ///
    /// The UDF takes two range-array arguments (millisecond timestamps and `Float64` values)
    /// and returns a `Float64` column holding one prediction per range.
    pub fn scalar_udf(t: i64) -> ScalarUDF {
        let input_types = vec![
            RangeArray::convert_data_type(DataType::Timestamp(TimeUnit::Millisecond, None)),
            RangeArray::convert_data_type(DataType::Float64),
        ];
        create_udf(
            Self::name(),
            input_types,
            DataType::Float64,
            Volatility::Immutable,
            // The offset is captured by value; a fresh evaluator is built on every call.
            Arc::new(move |input: &_| Self::new(t).predict_linear(input)) as _,
        )
    }

    /// Evaluates the function over every range of the two input range arrays.
    ///
    /// `input[0]` must be the timestamp range array and `input[1]` the value range array.
    /// Returns an array with one entry per range; an entry is null when
    /// `predict_linear_impl` yields `None` for that range.
    ///
    /// # Errors
    ///
    /// Returns `DataFusionError::Execution` when the argument count is not 2, when the two
    /// range arrays (or the two slices of a single range) differ in length, or when the
    /// element types are not `TimestampMillisecond` / `Float64`. Errors from `extract_array`
    /// and `RangeArray::try_new` are propagated.
    fn predict_linear(&self, input: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
        // Report a bad call as an error instead of panicking inside the query engine.
        error::ensure(
            input.len() == 2,
            DataFusionError::Execution(format!(
                "{}: expect 2 arguments, found {}",
                Self::name(),
                input.len()
            )),
        )?;

        let ts_array = extract_array(&input[0])?;
        let value_array = extract_array(&input[1])?;

        let ts_range: RangeArray = RangeArray::try_new(ts_array.to_data().into())?;
        let value_range: RangeArray = RangeArray::try_new(value_array.to_data().into())?;
        error::ensure(
            ts_range.len() == value_range.len(),
            DataFusionError::Execution(format!(
                "{}: input arrays should have the same length, found {} and {}",
                Self::name(),
                ts_range.len(),
                value_range.len()
            )),
        )?;
        error::ensure(
            ts_range.value_type() == DataType::Timestamp(TimeUnit::Millisecond, None),
            DataFusionError::Execution(format!(
                "{}: expect TimestampMillisecond as time index array's type, found {}",
                Self::name(),
                ts_range.value_type()
            )),
        )?;
        error::ensure(
            value_range.value_type() == DataType::Float64,
            DataFusionError::Execution(format!(
                "{}: expect Float64 as value array's type, found {}",
                Self::name(),
                value_range.value_type()
            )),
        )?;

        let mut result_array = Vec::with_capacity(ts_range.len());
        for index in 0..ts_range.len() {
            // `index` is below `len()` for both range arrays (lengths were checked equal above),
            // and the element types were verified above, so the unwraps below rely on those
            // invariants.
            let ts_slice = ts_range.get(index).unwrap();
            let timestamps = ts_slice
                .as_any()
                .downcast_ref::<TimestampMillisecondArray>()
                .unwrap();
            let value_slice = value_range.get(index).unwrap();
            let values = value_slice
                .as_any()
                .downcast_ref::<Float64Array>()
                .unwrap();

            // Checked lazily so the error message is only formatted when the check fails,
            // not once per row.
            if timestamps.len() != values.len() {
                return Err(DataFusionError::Execution(format!(
                    "{}: input arrays should have the same length, found {} and {}",
                    Self::name(),
                    timestamps.len(),
                    values.len()
                )));
            }

            result_array.push(predict_linear_impl(timestamps, values, self.t));
        }

        let result = ColumnarValue::Array(Arc::new(Float64Array::from_iter(result_array)));
        Ok(result)
    }
}
/// Computes the prediction for a single range of samples.
///
/// Returns `None` when the range holds fewer than two timestamps or when
/// `linear_regression` does not produce both a slope and an intercept. Otherwise returns
/// `slope * t + intercept`.
fn predict_linear_impl(
    timestamps: &TimestampMillisecondArray,
    values: &Float64Array,
    t: i64,
) -> Option<f64> {
    if timestamps.len() < 2 {
        return None;
    }

    // The last timestamp of the range is handed to `linear_regression` as its third argument.
    // NOTE(review): presumably the point at which the intercept is evaluated — confirm in
    // `linear_regression`.
    let evaluate_ts = timestamps.value(timestamps.len() - 1);

    match linear_regression(timestamps, values, evaluate_ts) {
        (Some(slope), Some(intercept)) => Some(slope * t as f64 + intercept),
        _ => None,
    }
}
#[cfg(test)]
mod test {
    use std::vec;

    use super::*;
    use crate::functions::test_util::simple_range_udf_runner;

    /// Builds the shared fixture: a single range over 11 samples spaced 300 ms apart, whose
    /// values climb in steps of 10 and fall back to 0 once at the sixth sample.
    fn build_test_range_arrays() -> (RangeArray, RangeArray) {
        // Timestamps 0, 300, ..., 3000 (milliseconds).
        let timestamps = Arc::new(TimestampMillisecondArray::from_iter(
            (0..11i64).map(|step| Some(step * 300)),
        ));
        let samples = Arc::new(Float64Array::from_iter([
            0.0, 10.0, 20.0, 30.0, 40.0, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0,
        ]));
        // One range covering all 11 samples.
        let ranges = [(0, 11)];
        let ts_range = RangeArray::from_ranges(timestamps, ranges).unwrap();
        let value_range = RangeArray::from_ranges(samples, ranges).unwrap();
        (ts_range, value_range)
    }

    /// Runs the UDF with offset `t` over the shared fixture and checks its single output row.
    fn assert_prediction(t: i64, expected: f64) {
        let (ts_range, value_range) = build_test_range_arrays();
        simple_range_udf_runner(
            PredictLinear::scalar_udf(t),
            ts_range,
            value_range,
            vec![Some(expected)],
        );
    }

    #[test]
    fn calculate_predict_linear_none() {
        // An empty range and a one-sample range: both have fewer than two points.
        let ranges = [(0, 0), (0, 1)];
        let timestamps = Arc::new(TimestampMillisecondArray::from_iter(
            [0i64].into_iter().map(Some),
        ));
        let samples = Arc::new(Float64Array::from_iter([0.0]));
        let ts_range = RangeArray::from_ranges(timestamps, ranges).unwrap();
        let value_range = RangeArray::from_ranges(samples, ranges).unwrap();
        simple_range_udf_runner(
            PredictLinear::scalar_udf(0),
            ts_range,
            value_range,
            vec![None, None],
        );
    }

    #[test]
    fn calculate_predict_linear_test1() {
        assert_prediction(0, 38.63636363636364);
    }

    #[test]
    fn calculate_predict_linear_test2() {
        assert_prediction(3000, 31856.818181818187);
    }

    #[test]
    fn calculate_predict_linear_test3() {
        assert_prediction(4200, 44584.09090909091);
    }

    #[test]
    fn calculate_predict_linear_test4() {
        assert_prediction(6600, 70038.63636363638);
    }

    #[test]
    fn calculate_predict_linear_test5() {
        assert_prediction(7800, 82765.9090909091);
    }
}