Skip to main content

query/promql/
planner.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
16use std::sync::Arc;
17use std::time::UNIX_EPOCH;
18
19use arrow::datatypes::IntervalDayTime;
20use async_recursion::async_recursion;
21use catalog::table_source::DfTableSourceProvider;
22use common_error::ext::ErrorExt;
23use common_error::status_code::StatusCode;
24use common_function::function::FunctionContext;
25use common_query::prelude::greptime_value;
26use datafusion::common::DFSchemaRef;
27use datafusion::datasource::DefaultTableSource;
28use datafusion::functions_aggregate::average::avg_udaf;
29use datafusion::functions_aggregate::count::count_udaf;
30use datafusion::functions_aggregate::expr_fn::first_value;
31use datafusion::functions_aggregate::min_max::{max_udaf, min_udaf};
32use datafusion::functions_aggregate::stddev::stddev_pop_udaf;
33use datafusion::functions_aggregate::sum::sum_udaf;
34use datafusion::functions_aggregate::variance::var_pop_udaf;
35use datafusion::functions_window::row_number::RowNumber;
36use datafusion::logical_expr::expr::{Alias, ScalarFunction, WindowFunction};
37use datafusion::logical_expr::expr_rewriter::normalize_cols;
38use datafusion::logical_expr::{
39    BinaryExpr, Cast, Extension, LogicalPlan, LogicalPlanBuilder, Operator,
40    ScalarUDF as ScalarUdfDef, WindowFrame, WindowFunctionDefinition,
41};
42use datafusion::prelude as df_prelude;
43use datafusion::prelude::{Column, Expr as DfExpr, JoinType};
44use datafusion::scalar::ScalarValue;
45use datafusion::sql::TableReference;
46use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
47use datafusion_common::{DFSchema, NullEquality};
48use datafusion_expr::expr::WindowFunctionParams;
49use datafusion_expr::utils::conjunction;
50use datafusion_expr::{
51    ExprSchemable, Literal, Projection, SortExpr, TableScan, TableSource, col, lit,
52};
53use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
54use datatypes::data_type::ConcreteDataType;
55use itertools::Itertools;
56use once_cell::sync::Lazy;
57use promql::extension_plan::{
58    Absent, EmptyMetric, HistogramFold, InstantManipulate, Millisecond, RangeManipulate,
59    ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn, build_special_time_expr,
60};
61use promql::functions::{
62    AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, DoubleExponentialSmoothing,
63    IDelta, Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
64    QuantileOverTime, Rate, Resets, Round, StddevOverTime, StdvarOverTime, SumOverTime,
65    quantile_udaf,
66};
67use promql_parser::label::{METRIC_NAME, MatchOp, Matcher, Matchers};
68use promql_parser::parser::token::TokenType;
69use promql_parser::parser::value::ValueType;
70use promql_parser::parser::{
71    AggregateExpr, BinModifier, BinaryExpr as PromBinaryExpr, Call, EvalStmt, Expr as PromExpr,
72    Function, FunctionArgs as PromFunctionArgs, LabelModifier, MatrixSelector, NumberLiteral,
73    Offset, ParenExpr, StringLiteral, SubqueryExpr, UnaryExpr, VectorMatchCardinality,
74    VectorSelector, token,
75};
76use regex::{self, Regex};
77use snafu::{OptionExt, ResultExt, ensure};
78use store_api::metric_engine_consts::{
79    DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY,
80    METRIC_ENGINE_NAME, is_metric_engine_internal_column,
81};
82use table::table::adapter::DfTableProviderAdapter;
83
84use crate::parser::{
85    ALIAS_NODE_NAME, ANALYZE_NODE_NAME, ANALYZE_VERBOSE_NODE_NAME, AliasExpr, EXPLAIN_NODE_NAME,
86    EXPLAIN_VERBOSE_NODE_NAME,
87};
88use crate::promql::error::{
89    CatalogSnafu, ColumnNotFoundSnafu, CombineTableColumnMismatchSnafu, DataFusionPlanningSnafu,
90    ExpectRangeSelectorSnafu, FunctionInvalidArgumentSnafu, InvalidDestinationLabelNameSnafu,
91    InvalidRegularExpressionSnafu, InvalidTimeRangeSnafu, MultiFieldsNotSupportedSnafu,
92    MultipleMetricMatchersSnafu, MultipleVectorSnafu, NoMetricMatcherSnafu, PromqlPlanNodeSnafu,
93    Result, SameLabelSetSnafu, TableNameNotFoundSnafu, TimeIndexNotFoundSnafu,
94    UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, UnknownTableSnafu, UnsupportedExprSnafu,
95    UnsupportedMatcherOpSnafu, UnsupportedVectorMatchSnafu, ValueNotFoundSnafu,
96    ZeroRangeSelectorSnafu,
97};
98use crate::query_engine::QueryEngineState;
99
100/// `time()` function in PromQL.
101const SPECIAL_TIME_FUNCTION: &str = "time";
102/// `scalar()` function in PromQL.
103const SCALAR_FUNCTION: &str = "scalar";
104/// `absent()` function in PromQL
105const SPECIAL_ABSENT_FUNCTION: &str = "absent";
106/// `histogram_quantile` function in PromQL
107const SPECIAL_HISTOGRAM_QUANTILE: &str = "histogram_quantile";
108/// `vector` function in PromQL
109const SPECIAL_VECTOR_FUNCTION: &str = "vector";
110/// `le` column for conventional histogram.
111const LE_COLUMN_NAME: &str = "le";
112
113/// Static regex for validating label names according to Prometheus specification.
114/// Label names must match the regex: [a-zA-Z_][a-zA-Z0-9_]*
115static LABEL_NAME_REGEX: Lazy<Regex> =
116    Lazy::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap());
117
118const DEFAULT_TIME_INDEX_COLUMN: &str = "time";
119
120/// default value column name for empty metric
121const DEFAULT_FIELD_COLUMN: &str = "value";
122
123/// Special modifier to project field columns under multi-field mode
124const FIELD_COLUMN_MATCHER: &str = "__field__";
125
126/// Special modifier for cross schema query
127const SCHEMA_COLUMN_MATCHER: &str = "__schema__";
128const DB_COLUMN_MATCHER: &str = "__database__";
129
130/// Prefix for generated binary island leaf aliases.
131const BINARY_ISLAND_LEAF_ALIAS_PREFIX: &str = "__prom_v";
132
133/// Threshold for scatter scan mode
134const MAX_SCATTER_POINTS: i64 = 400;
135
136/// Interval 1 hour in millisecond
137const INTERVAL_1H: i64 = 60 * 60 * 1000;
138
/// Mutable state threaded through the planner while a PromQL expression is converted
/// into a logical plan.
///
/// The query parameters are filled in from the evaluated statement; the planner states
/// describe the table/columns of the plan currently being built and are overwritten as
/// planning proceeds.
#[derive(Default, Debug, Clone)]
struct PromPlannerContext {
    // query parameters
    /// Query start, in milliseconds since the Unix epoch.
    start: Millisecond,
    /// Query end, in milliseconds since the Unix epoch.
    end: Millisecond,
    /// Evaluation step, in milliseconds.
    interval: Millisecond,
    /// Lookback delta, in milliseconds.
    lookback_delta: Millisecond,

    // planner states
    /// Name of the table the current plan reads from, if any.
    table_name: Option<String>,
    /// Name of the time index column of the current plan, if known.
    time_index_column: Option<String>,
    /// Names of the field (value) columns of the current plan.
    field_columns: Vec<String>,
    /// Names of the tag (label) columns of the current plan.
    tag_columns: Vec<String>,
    /// Use metric engine internal series identifier column (`__tsid`) as series key.
    ///
    /// This is enabled only when the underlying scan can provide `__tsid` (`UInt64`). The planner
    /// uses it internally (e.g. as the series key for [`SeriesDivide`]) and strips it from the
    /// final output.
    use_tsid: bool,
    /// The matcher for field columns `__field__`.
    field_column_matcher: Option<Vec<Matcher>>,
    /// The matcher for selectors (normal matchers).
    selector_matcher: Vec<Matcher>,
    /// Schema name of the current plan, if one was selected.
    schema_name: Option<String>,
    /// The range in millisecond of range selector. None if there is no range selector.
    range: Option<Millisecond>,
}
166
/// Normalized, hashable identity of a vector selector.
///
/// Two selectors that produce equal keys are treated as the same leaf when collecting
/// a binary island, so the leaf is only recorded once.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct VectorLeafKey {
    /// Metric name, taken from the selector name or from an equality matcher on the
    /// metric name label.
    metric_name: String,
    /// Sorted `(label name, operator, value)` triples of all other matchers.
    matchers: Vec<(String, String, String)>,
    /// Sorted groups of `(label name, operator, value)` triples for the `or` matchers;
    /// each group is sorted as well.
    or_matchers: Vec<Vec<(String, String, String)>>,
    /// Signed offset in milliseconds: positive for a positive offset, negative for a
    /// negative one and `0` when the selector has none.
    offset_ms: i128,
    /// `Debug` rendering of the selector's `@` modifier.
    at: String,
}
175
/// A unique vector selector collected while building an [`IslandExpr`].
#[derive(Debug, Clone)]
struct IslandLeaf {
    /// Copy of the selector as it appeared the first time it was seen.
    selector: VectorSelector,
    /// Metric name of the selector, kept for display purposes.
    display_table: String,
}
181
/// Expression tree of a "binary island": a PromQL sub-expression made only of literal
/// scalars, vector selectors, unary expressions and plain arithmetic binary operators.
#[derive(Debug, Clone)]
enum IslandExpr {
    /// A vector selector, referenced by its index in `IslandCollectEnv::leaves`.
    VectorLeaf(usize),
    /// A literal that was already converted into a DataFusion expression.
    Scalar(DfExpr),
    /// A PromQL unary expression applied to `input`.
    Unary {
        input: Box<IslandExpr>,
    },
    /// An arithmetic binary operator applied to `lhs` and `rhs`.
    Binary {
        op: TokenType,
        lhs: Box<IslandExpr>,
        rhs: Box<IslandExpr>,
    },
}
195
196impl IslandExpr {
197    fn try_new(expr: &PromExpr, env: &mut IslandCollectEnv) -> Option<Self> {
198        if let Some(expr) = PromPlanner::try_build_literal_expr(expr) {
199            return Some(Self::Scalar(expr));
200        }
201
202        match expr {
203            PromExpr::Paren(ParenExpr { expr }) => Self::try_new(expr, env),
204            PromExpr::VectorSelector(selector) => {
205                let leaf = env.intern_leaf(selector)?;
206                Some(Self::VectorLeaf(leaf))
207            }
208            PromExpr::Unary(UnaryExpr { expr }) => {
209                let input = Self::try_new(expr, env)?;
210                Some(Self::Unary {
211                    input: Box::new(input),
212                })
213            }
214            PromExpr::Binary(PromBinaryExpr {
215                lhs,
216                rhs,
217                op,
218                modifier,
219            }) if matches!(
220                op.id(),
221                token::T_ADD
222                    | token::T_SUB
223                    | token::T_MUL
224                    | token::T_DIV
225                    | token::T_MOD
226                    | token::T_POW
227                    | token::T_ATAN2
228            ) && modifier.as_ref().is_none_or(|modifier| {
229                !modifier.return_bool
230                    && modifier.matching.is_none()
231                    && matches!(modifier.card, VectorMatchCardinality::OneToOne)
232            }) =>
233            {
234                let lhs = Self::try_new(lhs, env)?;
235                let rhs = Self::try_new(rhs, env)?;
236                Some(Self::Binary {
237                    op: *op,
238                    lhs: Box::new(lhs),
239                    rhs: Box::new(rhs),
240                })
241            }
242            _ => None,
243        }
244    }
245}
246
/// Accumulator used while an [`IslandExpr`] is being built.
#[derive(Debug, Default)]
struct IslandCollectEnv {
    /// Maps a selector key to the index of its leaf in `leaves`.
    leaf_by_key: HashMap<VectorLeafKey, usize>,
    /// Unique leaves, in the order they were first seen.
    leaves: Vec<IslandLeaf>,
    /// Number of vector selectors visited, duplicates included.
    vector_occurrences: usize,
}
253
/// An island leaf together with the logical plan generated for it.
#[derive(Debug)]
struct PlannedIslandLeaf {
    /// Plan of the leaf's vector selector, wrapped in `alias`.
    plan: LogicalPlan,
    /// Snapshot of the planner context taken right after the leaf was planned.
    ctx: PromPlannerContext,
    /// Table alias given to `plan`.
    alias: TableReference,
    /// Metric name of the leaf, kept for display purposes.
    display_table: String,
}
261
/// Field expressions computed for a binary island.
#[derive(Debug)]
struct IslandFieldExprs {
    /// The expressions themselves.
    exprs: Vec<DfExpr>,
    // NOTE(review): presumably the output names paired with `exprs` — confirm against the builder.
    names: Vec<String>,
    /// Set when the result is a scalar; the island planner gives up in that case.
    scalar: bool,
}
268
269impl VectorLeafKey {
270    fn from_selector(selector: &VectorSelector) -> Option<Self> {
271        let mut metric_name = selector.name.clone();
272        let mut matchers = Vec::with_capacity(selector.matchers.matchers.len());
273        let matcher_key = |matcher: &Matcher| {
274            (
275                matcher.name.clone(),
276                matcher.op.to_string(),
277                matcher.value.clone(),
278            )
279        };
280
281        for matcher in &selector.matchers.matchers {
282            if matcher.name == METRIC_NAME {
283                if matcher.op != MatchOp::Equal || metric_name.is_some() {
284                    return None;
285                }
286                metric_name = Some(matcher.value.clone());
287            } else {
288                matchers.push(matcher_key(matcher));
289            }
290        }
291        matchers.sort();
292
293        let mut or_matchers = selector
294            .matchers
295            .or_matchers
296            .iter()
297            .map(|group| {
298                let mut group = group.iter().map(matcher_key).collect::<Vec<_>>();
299                group.sort();
300                group
301            })
302            .collect::<Vec<_>>();
303        or_matchers.sort();
304
305        Some(Self {
306            metric_name: metric_name?,
307            matchers,
308            or_matchers,
309            offset_ms: match &selector.offset {
310                Some(Offset::Pos(duration)) => duration.as_millis() as i128,
311                Some(Offset::Neg(duration)) => -(duration.as_millis() as i128),
312                None => 0,
313            },
314            at: format!("{:?}", selector.at),
315        })
316    }
317}
318
319impl IslandCollectEnv {
320    fn intern_leaf(&mut self, selector: &VectorSelector) -> Option<usize> {
321        self.vector_occurrences += 1;
322        let key = VectorLeafKey::from_selector(selector)?;
323        if let Some(id) = self.leaf_by_key.get(&key) {
324            return Some(*id);
325        }
326
327        let id = self.leaves.len();
328        self.leaves.push(IslandLeaf {
329            selector: selector.clone(),
330            display_table: key.metric_name.clone(),
331        });
332        self.leaf_by_key.insert(key, id);
333        Some(id)
334    }
335}
336
337impl PromPlannerContext {
338    fn from_eval_stmt(stmt: &EvalStmt) -> Self {
339        Self {
340            start: stmt.start.duration_since(UNIX_EPOCH).unwrap().as_millis() as _,
341            end: stmt.end.duration_since(UNIX_EPOCH).unwrap().as_millis() as _,
342            interval: stmt.interval.as_millis() as _,
343            lookback_delta: stmt.lookback_delta.as_millis() as _,
344            ..Default::default()
345        }
346    }
347
348    /// Reset all planner states
349    fn reset(&mut self) {
350        self.table_name = None;
351        self.time_index_column = None;
352        self.field_columns = vec![];
353        self.tag_columns = vec![];
354        self.use_tsid = false;
355        self.field_column_matcher = None;
356        self.selector_matcher.clear();
357        self.schema_name = None;
358        self.range = None;
359    }
360
361    /// Reset table name and schema to empty
362    fn reset_table_name_and_schema(&mut self) {
363        self.table_name = Some(String::new());
364        self.schema_name = None;
365        self.use_tsid = false;
366    }
367
368    /// Check if `le` is present in tag columns
369    fn has_le_tag(&self) -> bool {
370        self.tag_columns.iter().any(|c| c.eq(&LE_COLUMN_NAME))
371    }
372}
373
/// Planner that converts PromQL expressions into DataFusion logical plans.
pub struct PromPlanner {
    /// Source used to resolve the tables referenced by a query.
    table_provider: DfTableSourceProvider,
    /// Query parameters and planner state of the expression being planned.
    ctx: PromPlannerContext,
}
378
379impl PromPlanner {
380    pub async fn stmt_to_plan(
381        table_provider: DfTableSourceProvider,
382        stmt: &EvalStmt,
383        query_engine_state: &QueryEngineState,
384    ) -> Result<LogicalPlan> {
385        let mut planner = Self {
386            table_provider,
387            ctx: PromPlannerContext::from_eval_stmt(stmt),
388        };
389
390        let plan = planner
391            .prom_expr_to_plan(&stmt.expr, query_engine_state)
392            .await?;
393
394        // Never leak internal series identifier to output.
395        planner.strip_tsid_column(plan)
396    }
397
398    pub async fn prom_expr_to_plan(
399        &mut self,
400        prom_expr: &PromExpr,
401        query_engine_state: &QueryEngineState,
402    ) -> Result<LogicalPlan> {
403        self.prom_expr_to_plan_inner(prom_expr, false, query_engine_state)
404            .await
405    }
406
407    /**
408    Converts a PromQL expression to a logical plan.
409
410    NOTE:
411        The `timestamp_fn` indicates whether the PromQL `timestamp()` function is being evaluated in the current context.
412        If `true`, the planner generates a logical plan that projects the timestamp (time index) column
413        as the value column for each input row, implementing the PromQL `timestamp()` function semantics.
414        If `false`, the planner generates the standard logical plan for the given PromQL expression.
415    */
416    #[async_recursion]
417    async fn prom_expr_to_plan_inner(
418        &mut self,
419        prom_expr: &PromExpr,
420        timestamp_fn: bool,
421        query_engine_state: &QueryEngineState,
422    ) -> Result<LogicalPlan> {
423        let res = match prom_expr {
424            PromExpr::Aggregate(expr) => {
425                self.prom_aggr_expr_to_plan(query_engine_state, expr)
426                    .await?
427            }
428            PromExpr::Unary(expr) => {
429                self.prom_unary_expr_to_plan(query_engine_state, expr)
430                    .await?
431            }
432            PromExpr::Binary(expr) => {
433                self.prom_binary_expr_to_plan(query_engine_state, expr)
434                    .await?
435            }
436            PromExpr::Paren(ParenExpr { expr }) => {
437                self.prom_expr_to_plan_inner(expr, timestamp_fn, query_engine_state)
438                    .await?
439            }
440            PromExpr::Subquery(expr) => {
441                self.prom_subquery_expr_to_plan(query_engine_state, expr)
442                    .await?
443            }
444            PromExpr::NumberLiteral(lit) => self.prom_number_lit_to_plan(lit)?,
445            PromExpr::StringLiteral(lit) => self.prom_string_lit_to_plan(lit)?,
446            PromExpr::VectorSelector(selector) => {
447                self.prom_vector_selector_to_plan(selector, timestamp_fn)
448                    .await?
449            }
450            PromExpr::MatrixSelector(selector) => {
451                self.prom_matrix_selector_to_plan(selector).await?
452            }
453            PromExpr::Call(expr) => {
454                self.prom_call_expr_to_plan(query_engine_state, expr)
455                    .await?
456            }
457            PromExpr::Extension(expr) => {
458                self.prom_ext_expr_to_plan(query_engine_state, expr).await?
459            }
460        };
461
462        Ok(res)
463    }
464
465    async fn prom_subquery_expr_to_plan(
466        &mut self,
467        query_engine_state: &QueryEngineState,
468        subquery_expr: &SubqueryExpr,
469    ) -> Result<LogicalPlan> {
470        let SubqueryExpr {
471            expr, range, step, ..
472        } = subquery_expr;
473
474        let current_interval = self.ctx.interval;
475        if let Some(step) = step {
476            self.ctx.interval = step.as_millis() as _;
477        }
478        let current_start = self.ctx.start;
479        self.ctx.start -= range.as_millis() as i64 - self.ctx.interval;
480        let input = self.prom_expr_to_plan(expr, query_engine_state).await?;
481        self.ctx.interval = current_interval;
482        self.ctx.start = current_start;
483
484        ensure!(!range.is_zero(), ZeroRangeSelectorSnafu);
485        let range_ms = range.as_millis() as _;
486        self.ctx.range = Some(range_ms);
487
488        let time_index_column =
489            self.ctx
490                .time_index_column
491                .clone()
492                .with_context(|| TimeIndexNotFoundSnafu {
493                    table: self.ctx.table_name.clone().unwrap_or_default(),
494                })?;
495
496        // `RangeManipulate` assumes each input batch holds exactly one series
497        // (it takes tag column values from row 0 and applies them to every
498        // output row). The inner expression may emit batches that mix series,
499        // so sort by series key + time index and split into per-series batches
500        // with a `SeriesDivide` first.
501        let input_schema = input.schema();
502        let input_has_tsid = input_schema.fields().iter().any(|field| {
503            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
504                && field.data_type() == &ArrowDataType::UInt64
505        });
506        let (series_key_columns, mut sort_exprs) = if input_has_tsid {
507            (
508                vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()],
509                vec![
510                    DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME))
511                        .sort(true, true),
512                ],
513            )
514        } else {
515            // Only use tag columns that survive in the inner plan's schema —
516            // `ctx.tag_columns` can drift from the actual output.
517            let key_columns: Vec<String> = self
518                .ctx
519                .tag_columns
520                .iter()
521                .filter(|name| input_schema.has_column_with_unqualified_name(name))
522                .cloned()
523                .collect();
524            let sort = key_columns
525                .iter()
526                .map(|name| DfExpr::Column(Column::from_name(name)).sort(true, true))
527                .collect::<Vec<_>>();
528            (key_columns, sort)
529        };
530        sort_exprs.push(DfExpr::Column(Column::from_name(&time_index_column)).sort(true, true));
531
532        let sort_plan = LogicalPlanBuilder::from(input)
533            .sort(sort_exprs)
534            .context(DataFusionPlanningSnafu)?
535            .build()
536            .context(DataFusionPlanningSnafu)?;
537        let divide_plan = LogicalPlan::Extension(Extension {
538            node: Arc::new(SeriesDivide::new(
539                series_key_columns,
540                time_index_column.clone(),
541                sort_plan,
542            )),
543        });
544
545        let manipulate = RangeManipulate::new(
546            self.ctx.start,
547            self.ctx.end,
548            self.ctx.interval,
549            range_ms,
550            time_index_column,
551            self.ctx.field_columns.clone(),
552            divide_plan,
553        )
554        .context(DataFusionPlanningSnafu)?;
555
556        Ok(LogicalPlan::Extension(Extension {
557            node: Arc::new(manipulate),
558        }))
559    }
560
561    async fn prom_aggr_expr_to_plan(
562        &mut self,
563        query_engine_state: &QueryEngineState,
564        aggr_expr: &AggregateExpr,
565    ) -> Result<LogicalPlan> {
566        let AggregateExpr {
567            op,
568            expr,
569            modifier,
570            param,
571        } = aggr_expr;
572
573        let mut input = self.prom_expr_to_plan(expr, query_engine_state).await?;
574        let input_has_tsid = input.schema().fields().iter().any(|field| {
575            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
576                && field.data_type() == &ArrowDataType::UInt64
577        });
578
579        // `__tsid` based scan projection may prune tag columns. Ensure tags referenced in
580        // aggregation modifiers (`by`/`without`) are available before planning group keys.
581        let required_group_tags = match modifier {
582            None => BTreeSet::new(),
583            Some(LabelModifier::Include(labels)) => labels
584                .labels
585                .iter()
586                .filter(|label| !is_metric_engine_internal_column(label.as_str()))
587                .cloned()
588                .collect(),
589            Some(LabelModifier::Exclude(labels)) => {
590                let mut all_tags = self.collect_row_key_tag_columns_from_plan(&input)?;
591                for label in &labels.labels {
592                    let _ = all_tags.remove(label);
593                }
594                all_tags
595            }
596        };
597
598        if !required_group_tags.is_empty()
599            && required_group_tags
600                .iter()
601                .any(|tag| Self::find_case_sensitive_column(input.schema(), tag.as_str()).is_none())
602        {
603            input = self.ensure_tag_columns_available(input, &required_group_tags)?;
604            self.refresh_tag_columns_from_schema(input.schema());
605        }
606
607        match (*op).id() {
608            token::T_TOPK | token::T_BOTTOMK => {
609                self.prom_topk_bottomk_to_plan(aggr_expr, input).await
610            }
611            _ => {
612                // When `__tsid` is available, tag columns may have been pruned from the input plan.
613                // For `keep_tsid` decision we should compare against the full row-key label set,
614                // otherwise we may incorrectly treat label-reducing aggregates as preserving labels.
615                let input_tag_columns = if input_has_tsid {
616                    self.collect_row_key_tag_columns_from_plan(&input)?
617                        .into_iter()
618                        .collect::<Vec<_>>()
619                } else {
620                    self.ctx.tag_columns.clone()
621                };
622                // calculate columns to group by
623                // Need to append time index column into group by columns
624                let mut group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
625                // convert op and value columns to aggregate exprs
626                let (mut aggr_exprs, prev_field_exprs) =
627                    self.create_aggregate_exprs(*op, param, &input)?;
628
629                let keep_tsid = op.id() != token::T_COUNT_VALUES
630                    && input_has_tsid
631                    && input_tag_columns.iter().collect::<HashSet<_>>()
632                        == self.ctx.tag_columns.iter().collect::<HashSet<_>>();
633
634                if keep_tsid {
635                    aggr_exprs.push(
636                        first_value(
637                            DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME)),
638                            vec![],
639                        )
640                        .alias(DATA_SCHEMA_TSID_COLUMN_NAME),
641                    );
642                }
643                self.ctx.use_tsid = keep_tsid;
644
645                // create plan
646                let builder = LogicalPlanBuilder::from(input);
647                let builder = if op.id() == token::T_COUNT_VALUES {
648                    let label = Self::get_param_value_as_str(*op, param)?;
649                    // `count_values` must be grouped by fields,
650                    // and project the fields to the new label.
651                    group_exprs.extend(prev_field_exprs.clone());
652                    let project_fields = self
653                        .create_field_column_exprs()?
654                        .into_iter()
655                        .chain(self.create_tag_column_exprs()?)
656                        .chain(Some(self.create_time_index_column_expr()?))
657                        .chain(prev_field_exprs.into_iter().map(|expr| expr.alias(label)));
658
659                    builder
660                        .aggregate(group_exprs.clone(), aggr_exprs)
661                        .context(DataFusionPlanningSnafu)?
662                        .project(project_fields)
663                        .context(DataFusionPlanningSnafu)?
664                } else {
665                    builder
666                        .aggregate(group_exprs.clone(), aggr_exprs)
667                        .context(DataFusionPlanningSnafu)?
668                };
669
670                let sort_expr = group_exprs.into_iter().map(|expr| expr.sort(true, false));
671
672                builder
673                    .sort(sort_expr)
674                    .context(DataFusionPlanningSnafu)?
675                    .build()
676                    .context(DataFusionPlanningSnafu)
677            }
678        }
679    }
680
681    /// Create logical plan for PromQL topk and bottomk expr.
682    async fn prom_topk_bottomk_to_plan(
683        &mut self,
684        aggr_expr: &AggregateExpr,
685        input: LogicalPlan,
686    ) -> Result<LogicalPlan> {
687        let AggregateExpr {
688            op,
689            param,
690            modifier,
691            ..
692        } = aggr_expr;
693
694        let input_has_tsid = input.schema().fields().iter().any(|field| {
695            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
696                && field.data_type() == &ArrowDataType::UInt64
697        });
698        self.ctx.use_tsid = input_has_tsid;
699
700        let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, false)?;
701
702        let val = Self::get_param_as_literal_expr(param, Some(*op), Some(ArrowDataType::Float64))?;
703
704        // convert op and value columns to window exprs.
705        let window_exprs = self.create_window_exprs(*op, group_exprs.clone(), &input)?;
706
707        let rank_columns: Vec<_> = window_exprs
708            .iter()
709            .map(|expr| expr.schema_name().to_string())
710            .collect();
711
712        // Create ranks filter with `Operator::Or`.
713        // Safety: at least one rank column
714        let filter: DfExpr = rank_columns
715            .iter()
716            .fold(None, |expr, rank| {
717                let predicate = DfExpr::BinaryExpr(BinaryExpr {
718                    left: Box::new(col(rank)),
719                    op: Operator::LtEq,
720                    right: Box::new(val.clone()),
721                });
722
723                match expr {
724                    None => Some(predicate),
725                    Some(expr) => Some(DfExpr::BinaryExpr(BinaryExpr {
726                        left: Box::new(expr),
727                        op: Operator::Or,
728                        right: Box::new(predicate),
729                    })),
730                }
731            })
732            .unwrap();
733
734        let rank_columns: Vec<_> = rank_columns.into_iter().map(col).collect();
735
736        let mut new_group_exprs = group_exprs.clone();
737        // Order by ranks
738        new_group_exprs.extend(rank_columns);
739
740        let group_sort_expr = new_group_exprs
741            .into_iter()
742            .map(|expr| expr.sort(true, false));
743
744        let project_fields = self
745            .create_field_column_exprs()?
746            .into_iter()
747            .chain(self.create_tag_column_exprs()?)
748            .chain(
749                self.ctx
750                    .use_tsid
751                    .then_some(DfExpr::Column(Column::from_name(
752                        DATA_SCHEMA_TSID_COLUMN_NAME,
753                    ))),
754            )
755            .chain(Some(self.create_time_index_column_expr()?));
756
757        LogicalPlanBuilder::from(input)
758            .window(window_exprs)
759            .context(DataFusionPlanningSnafu)?
760            .filter(filter)
761            .context(DataFusionPlanningSnafu)?
762            .sort(group_sort_expr)
763            .context(DataFusionPlanningSnafu)?
764            .project(project_fields)
765            .context(DataFusionPlanningSnafu)?
766            .build()
767            .context(DataFusionPlanningSnafu)
768    }
769
770    async fn prom_unary_expr_to_plan(
771        &mut self,
772        query_engine_state: &QueryEngineState,
773        unary_expr: &UnaryExpr,
774    ) -> Result<LogicalPlan> {
775        let UnaryExpr { expr } = unary_expr;
776        // Unary Expr in PromQL implys the `-` operator
777        let input = self.prom_expr_to_plan(expr, query_engine_state).await?;
778        self.projection_for_each_field_column(input, |col| {
779            Ok(DfExpr::Negative(Box::new(DfExpr::Column(col.into()))))
780        })
781    }
782
783    async fn try_plan_binary_island(
784        &mut self,
785        binary_expr: &PromBinaryExpr,
786    ) -> Result<Option<LogicalPlan>> {
787        let original_ctx = self.ctx.clone();
788        let mut collect_env = IslandCollectEnv::default();
789        let Some(island_expr) =
790            IslandExpr::try_new(&PromExpr::Binary(binary_expr.clone()), &mut collect_env)
791        else {
792            return Ok(None);
793        };
794
795        if collect_env.leaves.is_empty()
796            || collect_env.vector_occurrences <= collect_env.leaves.len()
797        {
798            return Ok(None);
799        }
800
801        let mut planned_leaves = Vec::with_capacity(collect_env.leaves.len());
802        for (idx, leaf) in collect_env.leaves.iter().enumerate() {
803            let plan = self
804                .prom_vector_selector_to_plan(&leaf.selector, false)
805                .await?;
806            let ctx = self.ctx.clone();
807            let alias = TableReference::bare(format!("{BINARY_ISLAND_LEAF_ALIAS_PREFIX}{idx}"));
808            let plan = LogicalPlanBuilder::from(plan)
809                .alias(alias.clone())
810                .context(DataFusionPlanningSnafu)?
811                .build()
812                .context(DataFusionPlanningSnafu)?;
813            planned_leaves.push(PlannedIslandLeaf {
814                plan,
815                ctx,
816                alias,
817                display_table: leaf.display_table.clone(),
818            });
819        }
820
821        if !Self::binary_island_join_contexts_supported(&planned_leaves) {
822            self.ctx = original_ctx;
823            return Ok(None);
824        }
825
826        let mut input = planned_leaves[0].plan.clone();
827        for right_idx in 1..planned_leaves.len() {
828            input = self.join_binary_island_leaf(
829                input,
830                &planned_leaves[0],
831                &planned_leaves[right_idx],
832            )?;
833        }
834
835        let field_exprs =
836            Self::build_binary_island_field_exprs(&island_expr, &planned_leaves, input.schema())?;
837        if field_exprs.scalar || field_exprs.exprs.is_empty() {
838            self.ctx = original_ctx;
839            return Ok(None);
840        }
841
842        let plan = self.project_binary_island(
843            input,
844            &planned_leaves[0].alias,
845            &planned_leaves[0].ctx,
846            field_exprs,
847        )?;
848        Ok(Some(plan))
849    }
850
851    fn binary_island_join_contexts_supported(leaves: &[PlannedIslandLeaf]) -> bool {
852        if leaves
853            .iter()
854            .any(|leaf| leaf.ctx.time_index_column.is_none())
855        {
856            return false;
857        }
858
859        if leaves.len() <= 1 {
860            return true;
861        }
862
863        let first_tags = leaves[0].ctx.tag_columns.iter().collect::<BTreeSet<_>>();
864
865        leaves.iter().skip(1).all(|leaf| {
866            (Self::plan_has_tsid_column(&leaves[0].plan) && Self::plan_has_tsid_column(&leaf.plan))
867                || leaf.ctx.tag_columns.iter().collect::<BTreeSet<_>>() == first_tags
868        })
869    }
870
871    fn join_binary_island_leaf(
872        &self,
873        left: LogicalPlan,
874        first_leaf: &PlannedIslandLeaf,
875        right_leaf: &PlannedIslandLeaf,
876    ) -> Result<LogicalPlan> {
877        let only_join_time_index =
878            first_leaf.ctx.tag_columns.is_empty() || right_leaf.ctx.tag_columns.is_empty();
879        let (mut left_keys, mut right_keys, force_empty_join) = self.binary_join_key_columns(
880            left.schema(),
881            right_leaf.plan.schema(),
882            &first_leaf.ctx,
883            &right_leaf.ctx,
884            only_join_time_index,
885            &None,
886        )?;
887
888        if let (Some(left_time_index_column), Some(right_time_index_column)) = (
889            first_leaf.ctx.time_index_column.clone(),
890            right_leaf.ctx.time_index_column.clone(),
891        ) {
892            left_keys.insert(left_time_index_column);
893            right_keys.insert(right_time_index_column);
894        }
895
896        LogicalPlanBuilder::from(left)
897            .join_detailed(
898                right_leaf.plan.clone(),
899                JoinType::Inner,
900                (
901                    left_keys
902                        .into_iter()
903                        .map(|name| Column::new(Some(first_leaf.alias.clone()), name))
904                        .collect::<Vec<_>>(),
905                    right_keys
906                        .into_iter()
907                        .map(|name| Column::new(Some(right_leaf.alias.clone()), name))
908                        .collect::<Vec<_>>(),
909                ),
910                force_empty_join.then_some(lit(false)),
911                NullEquality::NullEqualsNull,
912            )
913            .context(DataFusionPlanningSnafu)?
914            .build()
915            .context(DataFusionPlanningSnafu)
916    }
917
918    fn build_binary_island_field_exprs(
919        expr: &IslandExpr,
920        leaves: &[PlannedIslandLeaf],
921        schema: &DFSchemaRef,
922    ) -> Result<IslandFieldExprs> {
923        match expr {
924            IslandExpr::VectorLeaf(id) => {
925                let leaf = &leaves[*id];
926                let exprs = leaf
927                    .ctx
928                    .field_columns
929                    .iter()
930                    .map(|field| {
931                        schema
932                            .qualified_field_with_name(Some(&leaf.alias), field)
933                            .context(DataFusionPlanningSnafu)
934                            .map(|field| DfExpr::Column(field.into()))
935                    })
936                    .collect::<Result<Vec<_>>>()?;
937                let names = leaf
938                    .ctx
939                    .field_columns
940                    .iter()
941                    .map(|field| format!("{}.{}", leaf.display_table, field))
942                    .collect();
943                Ok(IslandFieldExprs {
944                    exprs,
945                    names,
946                    scalar: false,
947                })
948            }
949            IslandExpr::Scalar(expr) => Ok(IslandFieldExprs {
950                exprs: vec![expr.clone()],
951                names: vec![expr.schema_name().to_string()],
952                scalar: true,
953            }),
954            IslandExpr::Unary { input } => {
955                let input = Self::build_binary_island_field_exprs(input, leaves, schema)?;
956                let mut exprs = Vec::with_capacity(input.exprs.len());
957                let mut names = Vec::with_capacity(input.names.len());
958                for (expr, name) in input.exprs.into_iter().zip(input.names) {
959                    exprs.push(DfExpr::Negative(Box::new(expr)));
960                    names.push(format!("-{name}"));
961                }
962                Ok(IslandFieldExprs {
963                    exprs,
964                    names,
965                    scalar: input.scalar,
966                })
967            }
968            IslandExpr::Binary { op, lhs, rhs } => {
969                let same_leaf = match (&**lhs, &**rhs) {
970                    (IslandExpr::VectorLeaf(left), IslandExpr::VectorLeaf(right))
971                        if left == right =>
972                    {
973                        Some(*left)
974                    }
975                    _ => None,
976                };
977                let lhs = Self::build_binary_island_field_exprs(lhs, leaves, schema)?;
978                let rhs = Self::build_binary_island_field_exprs(rhs, leaves, schema)?;
979                let expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
980                let scalar = lhs.scalar && rhs.scalar;
981                let op = op.to_string();
982
983                let (exprs, names) = match (lhs.scalar, rhs.scalar) {
984                    (true, true) => {
985                        let expr = expr_builder(lhs.exprs[0].clone(), rhs.exprs[0].clone())?;
986                        let name = format!("{} {op} {}", lhs.names[0], rhs.names[0]);
987                        (vec![expr], vec![name])
988                    }
989                    (true, false) => {
990                        let mut exprs = Vec::with_capacity(rhs.exprs.len());
991                        let mut names = Vec::with_capacity(rhs.names.len());
992                        for (rhs_expr, rhs_name) in rhs.exprs.into_iter().zip(rhs.names) {
993                            exprs.push(expr_builder(lhs.exprs[0].clone(), rhs_expr)?);
994                            names.push(format!("{} {op} {rhs_name}", lhs.names[0]));
995                        }
996                        (exprs, names)
997                    }
998                    (false, true) => {
999                        let mut exprs = Vec::with_capacity(lhs.exprs.len());
1000                        let mut names = Vec::with_capacity(lhs.names.len());
1001                        for (lhs_expr, lhs_name) in lhs.exprs.into_iter().zip(lhs.names) {
1002                            exprs.push(expr_builder(lhs_expr, rhs.exprs[0].clone())?);
1003                            names.push(format!("{lhs_name} {op} {}", rhs.names[0]));
1004                        }
1005                        (exprs, names)
1006                    }
1007                    (false, false) => {
1008                        let mut exprs = Vec::new();
1009                        let mut names = Vec::new();
1010                        for (idx, ((lhs_expr, rhs_expr), (mut lhs_name, mut rhs_name))) in lhs
1011                            .exprs
1012                            .into_iter()
1013                            .zip(rhs.exprs)
1014                            .zip(lhs.names.into_iter().zip(rhs.names))
1015                            .enumerate()
1016                        {
1017                            if let Some(leaf) = same_leaf {
1018                                let field = leaves[leaf]
1019                                    .ctx
1020                                    .field_columns
1021                                    .get(idx)
1022                                    .cloned()
1023                                    .unwrap_or_else(|| lhs_name.clone());
1024                                lhs_name = format!("lhs.{field}");
1025                                rhs_name = format!("rhs.{field}");
1026                            }
1027                            exprs.push(expr_builder(lhs_expr, rhs_expr)?);
1028                            names.push(format!("{lhs_name} {op} {rhs_name}"));
1029                        }
1030                        (exprs, names)
1031                    }
1032                };
1033
1034                Ok(IslandFieldExprs {
1035                    exprs,
1036                    names,
1037                    scalar,
1038                })
1039            }
1040        }
1041    }
1042
1043    fn project_binary_island(
1044        &mut self,
1045        input: LogicalPlan,
1046        base_alias: &TableReference,
1047        base_ctx: &PromPlannerContext,
1048        field_exprs: IslandFieldExprs,
1049    ) -> Result<LogicalPlan> {
1050        self.ctx = base_ctx.clone();
1051
1052        let schema = input.schema();
1053        let non_field_exprs = base_ctx
1054            .tag_columns
1055            .iter()
1056            .chain(base_ctx.time_index_column.iter())
1057            .map(|column| {
1058                schema
1059                    .qualified_field_with_name(Some(base_alias), column)
1060                    .context(DataFusionPlanningSnafu)
1061                    .map(|field| DfExpr::Column(field.into()))
1062            });
1063        let tsid_expr = Self::optional_tsid_projection(schema, Some(base_alias), base_ctx.use_tsid)
1064            .into_iter()
1065            .map(Ok);
1066
1067        self.ctx.field_columns = field_exprs.names;
1068        let field_exprs = field_exprs
1069            .exprs
1070            .into_iter()
1071            .zip(self.ctx.field_columns.iter())
1072            .map(|(expr, name)| Ok(DfExpr::Alias(Alias::new(expr, None::<String>, name))));
1073
1074        let project_exprs = non_field_exprs
1075            .chain(tsid_expr)
1076            .chain(field_exprs)
1077            .collect::<Result<Vec<_>>>()?;
1078
1079        let plan = LogicalPlanBuilder::from(input)
1080            .project(project_exprs)
1081            .context(DataFusionPlanningSnafu)?
1082            .build()
1083            .context(DataFusionPlanningSnafu)?;
1084
1085        self.ctx.table_name = None;
1086        self.ctx.schema_name = None;
1087
1088        Ok(plan)
1089    }
1090
1091    async fn prom_binary_expr_to_plan(
1092        &mut self,
1093        query_engine_state: &QueryEngineState,
1094        binary_expr: &PromBinaryExpr,
1095    ) -> Result<LogicalPlan> {
1096        if let Some(plan) = self.try_plan_binary_island(binary_expr).await? {
1097            return Ok(plan);
1098        }
1099
1100        let PromBinaryExpr {
1101            lhs,
1102            rhs,
1103            op,
1104            modifier,
1105        } = binary_expr;
1106
1107        // if set to true, comparison operator will return 0/1 (for true/false) instead of
1108        // filter on the result column
1109        let should_return_bool = if let Some(m) = modifier {
1110            m.return_bool
1111        } else {
1112            false
1113        };
1114        let is_comparison_op = Self::is_token_a_comparison_op(*op);
1115
1116        // we should build a filter plan here if the op is comparison op and need not
1117        // to return 0/1. Otherwise, we should build a projection plan
1118        match (
1119            Self::try_build_literal_expr(lhs),
1120            Self::try_build_literal_expr(rhs),
1121        ) {
1122            (Some(lhs), Some(rhs)) => {
1123                self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
1124                self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
1125                self.ctx.reset_table_name_and_schema();
1126                let field_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
1127                let mut field_expr = field_expr_builder(lhs, rhs)?;
1128
1129                if is_comparison_op && should_return_bool {
1130                    field_expr = DfExpr::Cast(Cast {
1131                        expr: Box::new(field_expr),
1132                        data_type: ArrowDataType::Float64,
1133                    });
1134                }
1135
1136                Ok(LogicalPlan::Extension(Extension {
1137                    node: Arc::new(
1138                        EmptyMetric::new(
1139                            self.ctx.start,
1140                            self.ctx.end,
1141                            self.ctx.interval,
1142                            SPECIAL_TIME_FUNCTION.to_string(),
1143                            DEFAULT_FIELD_COLUMN.to_string(),
1144                            Some(field_expr),
1145                        )
1146                        .context(DataFusionPlanningSnafu)?,
1147                    ),
1148                }))
1149            }
1150            // lhs is a literal, rhs is a column
1151            (Some(mut expr), None) => {
1152                let input = self.prom_expr_to_plan(rhs, query_engine_state).await?;
1153                // check if the literal is a special time expr
1154                if let Some(time_expr) = self.try_build_special_time_expr_with_context(lhs) {
1155                    expr = time_expr
1156                }
1157                let bin_expr_builder = |col: &String| {
1158                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
1159                    let mut binary_expr =
1160                        binary_expr_builder(expr.clone(), DfExpr::Column(col.into()))?;
1161
1162                    if is_comparison_op && should_return_bool {
1163                        binary_expr = DfExpr::Cast(Cast {
1164                            expr: Box::new(binary_expr),
1165                            data_type: ArrowDataType::Float64,
1166                        });
1167                    }
1168                    Ok(binary_expr)
1169                };
1170                if is_comparison_op && !should_return_bool {
1171                    self.filter_on_field_column(input, bin_expr_builder)
1172                } else {
1173                    self.projection_for_each_field_column(input, bin_expr_builder)
1174                }
1175            }
1176            // lhs is a column, rhs is a literal
1177            (None, Some(mut expr)) => {
1178                let input = self.prom_expr_to_plan(lhs, query_engine_state).await?;
1179                // check if the literal is a special time expr
1180                if let Some(time_expr) = self.try_build_special_time_expr_with_context(rhs) {
1181                    expr = time_expr
1182                }
1183                let bin_expr_builder = |col: &String| {
1184                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
1185                    let mut binary_expr =
1186                        binary_expr_builder(DfExpr::Column(col.into()), expr.clone())?;
1187
1188                    if is_comparison_op && should_return_bool {
1189                        binary_expr = DfExpr::Cast(Cast {
1190                            expr: Box::new(binary_expr),
1191                            data_type: ArrowDataType::Float64,
1192                        });
1193                    }
1194                    Ok(binary_expr)
1195                };
1196                if is_comparison_op && !should_return_bool {
1197                    self.filter_on_field_column(input, bin_expr_builder)
1198                } else {
1199                    self.projection_for_each_field_column(input, bin_expr_builder)
1200                }
1201            }
1202            // both are columns. join them on time index
1203            (None, None) => {
1204                let left_input = self.prom_expr_to_plan(lhs, query_engine_state).await?;
1205                let left_field_columns = self.ctx.field_columns.clone();
1206                let left_time_index_column = self.ctx.time_index_column.clone();
1207                let mut left_table_ref = self
1208                    .table_ref()
1209                    .unwrap_or_else(|_| TableReference::bare(""));
1210                let left_context = self.ctx.clone();
1211
1212                let right_input = self.prom_expr_to_plan(rhs, query_engine_state).await?;
1213                let right_field_columns = self.ctx.field_columns.clone();
1214                let right_time_index_column = self.ctx.time_index_column.clone();
1215                let mut right_table_ref = self
1216                    .table_ref()
1217                    .unwrap_or_else(|_| TableReference::bare(""));
1218                let right_context = self.ctx.clone();
1219
1220                // TODO(ruihang): avoid join if left and right are the same table
1221
1222                // set op has "special" join semantics
1223                if Self::is_token_a_set_op(*op) {
1224                    return self.set_op_on_non_field_columns(
1225                        left_input,
1226                        right_input,
1227                        left_context,
1228                        right_context,
1229                        *op,
1230                        modifier,
1231                    );
1232                }
1233
1234                // normal join
1235                if left_table_ref == right_table_ref {
1236                    // rename table references to avoid ambiguity
1237                    left_table_ref = TableReference::bare("lhs");
1238                    right_table_ref = TableReference::bare("rhs");
1239                    // `self.ctx` have ctx in right plan, if right plan have no tag,
1240                    // we use left plan ctx as the ctx for subsequent calculations,
1241                    // to avoid case like `host + scalar(...)`
1242                    // we need preserve tag column on `host` table in subsequent projection,
1243                    // which only show in left plan ctx.
1244                    if self.ctx.tag_columns.is_empty() {
1245                        self.ctx = left_context.clone();
1246                        self.ctx.table_name = Some("lhs".to_string());
1247                    } else {
1248                        self.ctx.table_name = Some("rhs".to_string());
1249                    }
1250                }
1251                let (output_field_columns, field_columns) =
1252                    Self::align_binary_field_columns(&left_field_columns, &right_field_columns);
1253                let left_aligned_field_columns = field_columns
1254                    .iter()
1255                    .map(|(left_col_name, _)| (*left_col_name).clone())
1256                    .collect::<Vec<_>>();
1257                let right_aligned_field_columns = field_columns
1258                    .iter()
1259                    .map(|(_, right_col_name)| (*right_col_name).clone())
1260                    .collect::<Vec<_>>();
1261                // PromQL binary arithmetic only combines the shared prefix of value columns.
1262                // Keep the output field count aligned with that zipped prefix so planning
1263                // remains stable even when the two sides have uneven multi-field schemas.
1264                self.ctx.field_columns = output_field_columns;
1265                let mut field_columns = field_columns.into_iter();
1266
1267                let join_plan = self.join_on_non_field_columns(
1268                    left_input,
1269                    right_input,
1270                    left_table_ref.clone(),
1271                    right_table_ref.clone(),
1272                    left_time_index_column,
1273                    right_time_index_column,
1274                    // if left plan or right plan tag is empty, means case like `scalar(...) + host` or `host + scalar(...)`
1275                    // under this case we only join on time index
1276                    left_context.tag_columns.is_empty() || right_context.tag_columns.is_empty(),
1277                    modifier,
1278                    &left_context,
1279                    &right_context,
1280                )?;
1281                let join_plan_schema = join_plan.schema().clone();
1282
1283                let bin_expr_builder = |_: &String| {
1284                    let (left_col_name, right_col_name) = field_columns.next().unwrap();
1285                    let left_col = join_plan_schema
1286                        .qualified_field_with_name(Some(&left_table_ref), left_col_name)
1287                        .context(DataFusionPlanningSnafu)?
1288                        .into();
1289                    let right_col = join_plan_schema
1290                        .qualified_field_with_name(Some(&right_table_ref), right_col_name)
1291                        .context(DataFusionPlanningSnafu)?
1292                        .into();
1293
1294                    let binary_expr_builder = Self::prom_token_to_binary_expr_builder(*op)?;
1295                    let mut binary_expr =
1296                        binary_expr_builder(DfExpr::Column(left_col), DfExpr::Column(right_col))?;
1297                    if is_comparison_op && should_return_bool {
1298                        binary_expr = DfExpr::Cast(Cast {
1299                            expr: Box::new(binary_expr),
1300                            data_type: ArrowDataType::Float64,
1301                        });
1302                    }
1303                    Ok(binary_expr)
1304                };
1305                if is_comparison_op && !should_return_bool {
1306                    // PromQL comparison operators without `bool` are filters:
1307                    //   - keep the instant-vector side sample values
1308                    //   - drop samples where the comparison is false
1309                    //
1310                    // So we filter on the join result and then project only the side that should
1311                    // be preserved according to PromQL semantics.
1312                    let filtered = self.filter_on_field_column(join_plan, bin_expr_builder)?;
1313                    let (project_table_ref, mut project_context, project_field_columns) =
1314                        match (lhs.value_type(), rhs.value_type()) {
1315                            (ValueType::Scalar, ValueType::Vector) => (
1316                                &right_table_ref,
1317                                right_context.clone(),
1318                                right_aligned_field_columns,
1319                            ),
1320                            _ => (
1321                                &left_table_ref,
1322                                left_context.clone(),
1323                                left_aligned_field_columns,
1324                            ),
1325                        };
1326                    project_context.field_columns = project_field_columns;
1327                    self.project_binary_join_side(filtered, project_table_ref, &project_context)
1328                } else {
1329                    self.projection_for_each_field_column(join_plan, bin_expr_builder)
1330                }
1331            }
1332        }
1333    }
1334
1335    fn project_binary_join_side(
1336        &mut self,
1337        input: LogicalPlan,
1338        table_ref: &TableReference,
1339        context: &PromPlannerContext,
1340    ) -> Result<LogicalPlan> {
1341        let schema = input.schema();
1342
1343        let mut project_exprs =
1344            Vec::with_capacity(context.tag_columns.len() + context.field_columns.len() + 2);
1345
1346        // Project time index from the chosen side.
1347        if let Some(time_index_column) = &context.time_index_column {
1348            let time_index_col = schema
1349                .qualified_field_with_name(Some(table_ref), time_index_column)
1350                .context(DataFusionPlanningSnafu)?
1351                .into();
1352            project_exprs.push(DfExpr::Column(time_index_col));
1353        }
1354
1355        // Project field columns from the chosen side.
1356        for field_column in &context.field_columns {
1357            let field_col = schema
1358                .qualified_field_with_name(Some(table_ref), field_column)
1359                .context(DataFusionPlanningSnafu)?
1360                .into();
1361            project_exprs.push(DfExpr::Column(field_col));
1362        }
1363
1364        // Project tag columns from the chosen side.
1365        for tag_column in &context.tag_columns {
1366            let tag_col = schema
1367                .qualified_field_with_name(Some(table_ref), tag_column)
1368                .context(DataFusionPlanningSnafu)?
1369                .into();
1370            project_exprs.push(DfExpr::Column(tag_col));
1371        }
1372
1373        // Preserve `__tsid` if present, so it can still be used internally downstream. It's
1374        // stripped from the final output anyway.
1375        if let Some(tsid_col) =
1376            Self::optional_tsid_projection(schema, Some(table_ref), context.use_tsid)
1377        {
1378            project_exprs.push(tsid_col);
1379        }
1380
1381        let plan = LogicalPlanBuilder::from(input)
1382            .project(project_exprs)
1383            .context(DataFusionPlanningSnafu)?
1384            .build()
1385            .context(DataFusionPlanningSnafu)?;
1386
1387        // Update context to reflect the projected schema. Don't keep a table qualifier since
1388        // the result is a derived expression.
1389        self.ctx = context.clone();
1390        self.ctx.table_name = None;
1391        self.ctx.schema_name = None;
1392
1393        Ok(plan)
1394    }
1395
1396    fn prom_number_lit_to_plan(&mut self, number_literal: &NumberLiteral) -> Result<LogicalPlan> {
1397        let NumberLiteral { val } = number_literal;
1398        self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
1399        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
1400        self.ctx.reset_table_name_and_schema();
1401        let literal_expr = df_prelude::lit(*val);
1402
1403        let plan = LogicalPlan::Extension(Extension {
1404            node: Arc::new(
1405                EmptyMetric::new(
1406                    self.ctx.start,
1407                    self.ctx.end,
1408                    self.ctx.interval,
1409                    SPECIAL_TIME_FUNCTION.to_string(),
1410                    DEFAULT_FIELD_COLUMN.to_string(),
1411                    Some(literal_expr),
1412                )
1413                .context(DataFusionPlanningSnafu)?,
1414            ),
1415        });
1416        Ok(plan)
1417    }
1418
1419    fn prom_string_lit_to_plan(&mut self, string_literal: &StringLiteral) -> Result<LogicalPlan> {
1420        let StringLiteral { val } = string_literal;
1421        self.ctx.time_index_column = Some(DEFAULT_TIME_INDEX_COLUMN.to_string());
1422        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
1423        self.ctx.reset_table_name_and_schema();
1424        let literal_expr = df_prelude::lit(val.clone());
1425
1426        let plan = LogicalPlan::Extension(Extension {
1427            node: Arc::new(
1428                EmptyMetric::new(
1429                    self.ctx.start,
1430                    self.ctx.end,
1431                    self.ctx.interval,
1432                    SPECIAL_TIME_FUNCTION.to_string(),
1433                    DEFAULT_FIELD_COLUMN.to_string(),
1434                    Some(literal_expr),
1435                )
1436                .context(DataFusionPlanningSnafu)?,
1437            ),
1438        });
1439        Ok(plan)
1440    }
1441
1442    async fn prom_vector_selector_to_plan(
1443        &mut self,
1444        vector_selector: &VectorSelector,
1445        timestamp_fn: bool,
1446    ) -> Result<LogicalPlan> {
1447        let VectorSelector {
1448            name,
1449            offset,
1450            matchers,
1451            at: _,
1452        } = vector_selector;
1453        let matchers = self.preprocess_label_matchers(matchers, name)?;
1454        if let Some(empty_plan) = self.setup_context().await? {
1455            return Ok(empty_plan);
1456        }
1457        let normalize = self
1458            .selector_to_series_normalize_plan(offset, matchers, false)
1459            .await?;
1460
1461        let normalize = if timestamp_fn {
1462            // If evaluating the PromQL `timestamp()` function, project the time index column as the value column
1463            // before wrapping with [`InstantManipulate`], so the output matches PromQL's `timestamp()` semantics.
1464            self.create_timestamp_func_plan(normalize)?
1465        } else {
1466            normalize
1467        };
1468
1469        let manipulate = InstantManipulate::new(
1470            self.ctx.start,
1471            self.ctx.end,
1472            self.ctx.lookback_delta,
1473            self.ctx.interval,
1474            self.ctx
1475                .time_index_column
1476                .clone()
1477                .expect("time index should be set in `setup_context`"),
1478            if self.ctx.use_tsid {
1479                vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]
1480            } else {
1481                self.ctx.tag_columns.clone()
1482            },
1483            self.ctx.field_columns.first().cloned(),
1484            normalize,
1485        );
1486        Ok(LogicalPlan::Extension(Extension {
1487            node: Arc::new(manipulate),
1488        }))
1489    }
1490
1491    /// Builds a projection plan for the PromQL `timestamp()` function.
1492    /// Projects the time index column as the value column for each row.
1493    ///
1494    /// # Arguments
1495    /// * `normalize` - Input [`LogicalPlan`] for the normalized series.
1496    ///
1497    /// # Returns
1498    /// Returns a [`Result<LogicalPlan>`] where the resulting logical plan projects the timestamp
1499    /// column as the value column, along with the original tag and time index columns.
1500    ///
1501    /// # Timestamp vs. Time Function
1502    ///
1503    /// - **Timestamp Function (`timestamp()`)**: In PromQL, the `timestamp()` function returns the
1504    ///   timestamp (time index) of each sample as the value column.
1505    ///
1506    /// - **Time Function (`time()`)**: The `time()` function returns the evaluation time of the query
1507    ///   as a scalar value.
1508    ///
1509    /// # Side Effects
1510    /// Updates the planner context's field columns to the timestamp column name.
1511    ///
1512    fn create_timestamp_func_plan(&mut self, normalize: LogicalPlan) -> Result<LogicalPlan> {
1513        let time_expr = build_special_time_expr(self.ctx.time_index_column.as_ref().unwrap())
1514            .alias(DEFAULT_FIELD_COLUMN);
1515        self.ctx.field_columns = vec![time_expr.schema_name().to_string()];
1516        let mut project_exprs = Vec::with_capacity(self.ctx.tag_columns.len() + 2);
1517        project_exprs.push(self.create_time_index_column_expr()?);
1518        project_exprs.push(time_expr);
1519        project_exprs.extend(self.create_tag_column_exprs()?);
1520
1521        LogicalPlanBuilder::from(normalize)
1522            .project(project_exprs)
1523            .context(DataFusionPlanningSnafu)?
1524            .build()
1525            .context(DataFusionPlanningSnafu)
1526    }
1527
1528    async fn prom_matrix_selector_to_plan(
1529        &mut self,
1530        matrix_selector: &MatrixSelector,
1531    ) -> Result<LogicalPlan> {
1532        let MatrixSelector { vs, range } = matrix_selector;
1533        let VectorSelector {
1534            name,
1535            offset,
1536            matchers,
1537            ..
1538        } = vs;
1539        let matchers = self.preprocess_label_matchers(matchers, name)?;
1540        ensure!(!range.is_zero(), ZeroRangeSelectorSnafu);
1541        let range_ms = range.as_millis() as _;
1542        self.ctx.range = Some(range_ms);
1543
1544        // Some functions like rate may require special fields in the RangeManipulate plan
1545        // so we can't skip RangeManipulate.
1546        let normalize = match self.setup_context().await? {
1547            Some(empty_plan) => empty_plan,
1548            None => {
1549                self.selector_to_series_normalize_plan(offset, matchers, true)
1550                    .await?
1551            }
1552        };
1553        let manipulate = RangeManipulate::new(
1554            self.ctx.start,
1555            self.ctx.end,
1556            self.ctx.interval,
1557            // TODO(ruihang): convert via Timestamp datatypes to support different time units
1558            range_ms,
1559            self.ctx
1560                .time_index_column
1561                .clone()
1562                .expect("time index should be set in `setup_context`"),
1563            self.ctx.field_columns.clone(),
1564            normalize,
1565        )
1566        .context(DataFusionPlanningSnafu)?;
1567
1568        Ok(LogicalPlan::Extension(Extension {
1569            node: Arc::new(manipulate),
1570        }))
1571    }
1572
1573    async fn prom_call_expr_to_plan(
1574        &mut self,
1575        query_engine_state: &QueryEngineState,
1576        call_expr: &Call,
1577    ) -> Result<LogicalPlan> {
1578        let Call { func, args } = call_expr;
1579        // some special functions that are not expression but a plan
1580        match func.name {
1581            SPECIAL_HISTOGRAM_QUANTILE => {
1582                return self.create_histogram_plan(args, query_engine_state).await;
1583            }
1584            SPECIAL_VECTOR_FUNCTION => return self.create_vector_plan(args).await,
1585            SCALAR_FUNCTION => return self.create_scalar_plan(args, query_engine_state).await,
1586            SPECIAL_ABSENT_FUNCTION => {
1587                return self.create_absent_plan(args, query_engine_state).await;
1588            }
1589            _ => {}
1590        }
1591
1592        // transform function arguments
1593        let args = self.create_function_args(&args.args)?;
1594        let input = if let Some(prom_expr) = &args.input {
1595            self.prom_expr_to_plan_inner(prom_expr, func.name == "timestamp", query_engine_state)
1596                .await?
1597        } else {
1598            self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
1599            self.ctx.reset_table_name_and_schema();
1600            self.ctx.tag_columns = vec![];
1601            self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
1602            LogicalPlan::Extension(Extension {
1603                node: Arc::new(
1604                    EmptyMetric::new(
1605                        self.ctx.start,
1606                        self.ctx.end,
1607                        self.ctx.interval,
1608                        SPECIAL_TIME_FUNCTION.to_string(),
1609                        DEFAULT_FIELD_COLUMN.to_string(),
1610                        None,
1611                    )
1612                    .context(DataFusionPlanningSnafu)?,
1613                ),
1614            })
1615        };
1616        let (mut func_exprs, new_tags) =
1617            self.create_function_expr(func, args.literals.clone(), query_engine_state)?;
1618        func_exprs.insert(0, self.create_time_index_column_expr()?);
1619        func_exprs.extend_from_slice(&self.create_tag_column_exprs()?);
1620        if let Some(tsid_col) =
1621            Self::optional_tsid_projection(input.schema(), None, self.ctx.use_tsid)
1622        {
1623            func_exprs.push(tsid_col);
1624        }
1625
1626        let builder = LogicalPlanBuilder::from(input)
1627            .project(func_exprs)
1628            .context(DataFusionPlanningSnafu)?
1629            .filter(self.create_empty_values_filter_expr()?)
1630            .context(DataFusionPlanningSnafu)?;
1631
1632        let builder = match func.name {
1633            "sort" => builder
1634                .sort(self.create_field_columns_sort_exprs(true))
1635                .context(DataFusionPlanningSnafu)?,
1636            "sort_desc" => builder
1637                .sort(self.create_field_columns_sort_exprs(false))
1638                .context(DataFusionPlanningSnafu)?,
1639            "sort_by_label" => builder
1640                .sort(Self::create_sort_exprs_by_tags(
1641                    func.name,
1642                    args.literals,
1643                    true,
1644                )?)
1645                .context(DataFusionPlanningSnafu)?,
1646            "sort_by_label_desc" => builder
1647                .sort(Self::create_sort_exprs_by_tags(
1648                    func.name,
1649                    args.literals,
1650                    false,
1651                )?)
1652                .context(DataFusionPlanningSnafu)?,
1653
1654            _ => builder,
1655        };
1656
1657        // Update context tags after building plan
1658        // We can't push them before planning, because they won't exist until projection.
1659        for tag in new_tags {
1660            self.ctx.tag_columns.push(tag);
1661        }
1662
1663        let plan = builder.build().context(DataFusionPlanningSnafu)?;
1664        common_telemetry::debug!("Created PromQL function plan: {plan:?} for {call_expr:?}");
1665
1666        Ok(plan)
1667    }
1668
1669    async fn prom_ext_expr_to_plan(
1670        &mut self,
1671        query_engine_state: &QueryEngineState,
1672        ext_expr: &promql_parser::parser::ast::Extension,
1673    ) -> Result<LogicalPlan> {
1674        // let promql_parser::parser::ast::Extension { expr } = ext_expr;
1675        let expr = &ext_expr.expr;
1676        let children = expr.children();
1677        let plan = self
1678            .prom_expr_to_plan(&children[0], query_engine_state)
1679            .await?;
1680        // Wrapper for the explanation/analyze of the existing plan
1681        // https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html#method.explain
1682        // if `analyze` is true, runs the actual plan and produces
1683        // information about metrics during run.
1684        // if `verbose` is true, prints out additional details when VERBOSE keyword is specified
1685        match expr.name() {
1686            ANALYZE_NODE_NAME => LogicalPlanBuilder::from(plan)
1687                .explain(false, true)
1688                .unwrap()
1689                .build()
1690                .context(DataFusionPlanningSnafu),
1691            ANALYZE_VERBOSE_NODE_NAME => LogicalPlanBuilder::from(plan)
1692                .explain(true, true)
1693                .unwrap()
1694                .build()
1695                .context(DataFusionPlanningSnafu),
1696            EXPLAIN_NODE_NAME => LogicalPlanBuilder::from(plan)
1697                .explain(false, false)
1698                .unwrap()
1699                .build()
1700                .context(DataFusionPlanningSnafu),
1701            EXPLAIN_VERBOSE_NODE_NAME => LogicalPlanBuilder::from(plan)
1702                .explain(true, false)
1703                .unwrap()
1704                .build()
1705                .context(DataFusionPlanningSnafu),
1706            ALIAS_NODE_NAME => {
1707                let alias = expr
1708                    .as_any()
1709                    .downcast_ref::<AliasExpr>()
1710                    .context(UnexpectedPlanExprSnafu {
1711                        desc: "Expected AliasExpr",
1712                    })?
1713                    .alias
1714                    .clone();
1715                self.apply_alias(plan, alias)
1716            }
1717            _ => LogicalPlanBuilder::empty(true)
1718                .build()
1719                .context(DataFusionPlanningSnafu),
1720        }
1721    }
1722
1723    /// Extract metric name from `__name__` matcher and set it into [PromPlannerContext].
1724    /// Returns a new [Matchers] that doesn't contain metric name matcher.
1725    ///
1726    /// Each call to this function means new selector is started. Thus, the context will be reset
1727    /// at first.
1728    ///
1729    /// Name rule:
1730    /// - if `name` is some, then the matchers MUST NOT contain `__name__` matcher.
1731    /// - if `name` is none, then the matchers MAY contain NONE OR MULTIPLE `__name__` matchers.
1732    #[allow(clippy::mutable_key_type)]
1733    fn preprocess_label_matchers(
1734        &mut self,
1735        label_matchers: &Matchers,
1736        name: &Option<String>,
1737    ) -> Result<Matchers> {
1738        self.ctx.reset();
1739
1740        let metric_name;
1741        if let Some(name) = name.clone() {
1742            metric_name = Some(name);
1743            ensure!(
1744                label_matchers.find_matchers(METRIC_NAME).is_empty(),
1745                MultipleMetricMatchersSnafu
1746            );
1747        } else {
1748            let mut matches = label_matchers.find_matchers(METRIC_NAME);
1749            ensure!(!matches.is_empty(), NoMetricMatcherSnafu);
1750            ensure!(matches.len() == 1, MultipleMetricMatchersSnafu);
1751            ensure!(
1752                matches[0].op == MatchOp::Equal,
1753                UnsupportedMatcherOpSnafu {
1754                    matcher_op: matches[0].op.to_string(),
1755                    matcher: METRIC_NAME
1756                }
1757            );
1758            metric_name = matches.pop().map(|m| m.value);
1759        }
1760
1761        self.ctx.table_name = metric_name;
1762
1763        let mut matchers = HashSet::new();
1764        for matcher in &label_matchers.matchers {
1765            // TODO(ruihang): support other metric match ops
1766            if matcher.name == FIELD_COLUMN_MATCHER {
1767                self.ctx
1768                    .field_column_matcher
1769                    .get_or_insert_default()
1770                    .push(matcher.clone());
1771            } else if matcher.name == SCHEMA_COLUMN_MATCHER || matcher.name == DB_COLUMN_MATCHER {
1772                ensure!(
1773                    matcher.op == MatchOp::Equal,
1774                    UnsupportedMatcherOpSnafu {
1775                        matcher: matcher.name.clone(),
1776                        matcher_op: matcher.op.to_string(),
1777                    }
1778                );
1779                self.ctx.schema_name = Some(matcher.value.clone());
1780            } else if matcher.name != METRIC_NAME {
1781                self.ctx.selector_matcher.push(matcher.clone());
1782                let _ = matchers.insert(matcher.clone());
1783            }
1784        }
1785
1786        Ok(Matchers::new(matchers.into_iter().collect()))
1787    }
1788
1789    async fn selector_to_series_normalize_plan(
1790        &mut self,
1791        offset: &Option<Offset>,
1792        label_matchers: Matchers,
1793        is_range_selector: bool,
1794    ) -> Result<LogicalPlan> {
1795        // make table scan plan
1796        let table_ref = self.table_ref()?;
1797        let mut table_scan = self.create_table_scan_plan(table_ref.clone()).await?;
1798        let table_schema = table_scan.schema();
1799
1800        // make filter exprs
1801        let offset_duration = match offset {
1802            Some(Offset::Pos(duration)) => duration.as_millis() as Millisecond,
1803            Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
1804            None => 0,
1805        };
1806        let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
1807        if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
1808            scan_filters.push(time_index_filter);
1809        }
1810        table_scan = LogicalPlanBuilder::from(table_scan)
1811            .filter(conjunction(scan_filters).unwrap()) // Safety: `scan_filters` is not empty.
1812            .context(DataFusionPlanningSnafu)?
1813            .build()
1814            .context(DataFusionPlanningSnafu)?;
1815
1816        // make a projection plan if there is any `__field__` matcher
1817        if let Some(field_matchers) = &self.ctx.field_column_matcher {
1818            let col_set = self.ctx.field_columns.iter().collect::<HashSet<_>>();
1819            // opt-in set
1820            let mut result_set = HashSet::new();
1821            // opt-out set
1822            let mut reverse_set = HashSet::new();
1823            for matcher in field_matchers {
1824                match &matcher.op {
1825                    MatchOp::Equal => {
1826                        if col_set.contains(&matcher.value) {
1827                            let _ = result_set.insert(matcher.value.clone());
1828                        } else {
1829                            return Err(ColumnNotFoundSnafu {
1830                                col: matcher.value.clone(),
1831                            }
1832                            .build());
1833                        }
1834                    }
1835                    MatchOp::NotEqual => {
1836                        if col_set.contains(&matcher.value) {
1837                            let _ = reverse_set.insert(matcher.value.clone());
1838                        } else {
1839                            return Err(ColumnNotFoundSnafu {
1840                                col: matcher.value.clone(),
1841                            }
1842                            .build());
1843                        }
1844                    }
1845                    MatchOp::Re(regex) => {
1846                        for col in &self.ctx.field_columns {
1847                            if regex.is_match(col) {
1848                                let _ = result_set.insert(col.clone());
1849                            }
1850                        }
1851                    }
1852                    MatchOp::NotRe(regex) => {
1853                        for col in &self.ctx.field_columns {
1854                            if regex.is_match(col) {
1855                                let _ = reverse_set.insert(col.clone());
1856                            }
1857                        }
1858                    }
1859                }
1860            }
1861            // merge two set
1862            if result_set.is_empty() {
1863                result_set = col_set.into_iter().cloned().collect();
1864            }
1865            for col in reverse_set {
1866                let _ = result_set.remove(&col);
1867            }
1868
1869            // mask the field columns in context using computed result set
1870            self.ctx.field_columns = self
1871                .ctx
1872                .field_columns
1873                .drain(..)
1874                .filter(|col| result_set.contains(col))
1875                .collect();
1876
1877            let exprs = result_set
1878                .into_iter()
1879                .map(|col| DfExpr::Column(Column::new_unqualified(col)))
1880                .chain(self.create_tag_column_exprs()?)
1881                .chain(
1882                    self.ctx
1883                        .use_tsid
1884                        .then_some(DfExpr::Column(Column::new_unqualified(
1885                            DATA_SCHEMA_TSID_COLUMN_NAME,
1886                        ))),
1887                )
1888                .chain(Some(self.create_time_index_column_expr()?))
1889                .collect::<Vec<_>>();
1890
1891            // reuse this variable for simplicity
1892            table_scan = LogicalPlanBuilder::from(table_scan)
1893                .project(exprs)
1894                .context(DataFusionPlanningSnafu)?
1895                .build()
1896                .context(DataFusionPlanningSnafu)?;
1897        }
1898
1899        // make sort plan
1900        let series_key_columns = if self.ctx.use_tsid {
1901            vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]
1902        } else {
1903            self.ctx.tag_columns.clone()
1904        };
1905
1906        let sort_exprs = if self.ctx.use_tsid {
1907            vec![
1908                DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME)).sort(true, true),
1909                self.create_time_index_column_expr()?.sort(true, true),
1910            ]
1911        } else {
1912            self.create_tag_and_time_index_column_sort_exprs()?
1913        };
1914
1915        let sort_plan = LogicalPlanBuilder::from(table_scan)
1916            .sort(sort_exprs)
1917            .context(DataFusionPlanningSnafu)?
1918            .build()
1919            .context(DataFusionPlanningSnafu)?;
1920
1921        // make divide plan
1922        let time_index_column =
1923            self.ctx
1924                .time_index_column
1925                .clone()
1926                .with_context(|| TimeIndexNotFoundSnafu {
1927                    table: table_ref.to_string(),
1928                })?;
1929        let divide_plan = LogicalPlan::Extension(Extension {
1930            node: Arc::new(SeriesDivide::new(
1931                series_key_columns.clone(),
1932                time_index_column,
1933                sort_plan,
1934            )),
1935        });
1936
1937        // make series_normalize plan
1938        if !is_range_selector && offset_duration == 0 {
1939            return Ok(divide_plan);
1940        }
1941        let series_normalize = SeriesNormalize::new(
1942            offset_duration,
1943            self.ctx
1944                .time_index_column
1945                .clone()
1946                .with_context(|| TimeIndexNotFoundSnafu {
1947                    table: table_ref.to_quoted_string(),
1948                })?,
1949            is_range_selector,
1950            series_key_columns,
1951            divide_plan,
1952        );
1953        let logical_plan = LogicalPlan::Extension(Extension {
1954            node: Arc::new(series_normalize),
1955        });
1956
1957        Ok(logical_plan)
1958    }
1959
1960    /// Convert [LabelModifier] to [Column] exprs for aggregation.
1961    /// Timestamp column and tag columns will be included.
1962    ///
1963    /// # Side effect
1964    ///
1965    /// This method will also change the tag columns in ctx if `update_ctx` is true.
1966    fn agg_modifier_to_col(
1967        &mut self,
1968        input_schema: &DFSchemaRef,
1969        modifier: &Option<LabelModifier>,
1970        update_ctx: bool,
1971    ) -> Result<Vec<DfExpr>> {
1972        match modifier {
1973            None => {
1974                if update_ctx {
1975                    self.ctx.tag_columns.clear();
1976                }
1977                Ok(vec![self.create_time_index_column_expr()?])
1978            }
1979            Some(LabelModifier::Include(labels)) => {
1980                if update_ctx {
1981                    self.ctx.tag_columns.clear();
1982                }
1983                let mut exprs = Vec::with_capacity(labels.labels.len());
1984                for label in &labels.labels {
1985                    if is_metric_engine_internal_column(label) {
1986                        continue;
1987                    }
1988                    // nonexistence label will be ignored
1989                    if let Some(column_name) = Self::find_case_sensitive_column(input_schema, label)
1990                    {
1991                        exprs.push(DfExpr::Column(Column::from_name(column_name.clone())));
1992
1993                        if update_ctx {
1994                            // update the tag columns in context
1995                            self.ctx.tag_columns.push(column_name);
1996                        }
1997                    }
1998                }
1999                // add timestamp column
2000                exprs.push(self.create_time_index_column_expr()?);
2001
2002                Ok(exprs)
2003            }
2004            Some(LabelModifier::Exclude(labels)) => {
2005                let mut all_fields = input_schema
2006                    .fields()
2007                    .iter()
2008                    .map(|f| f.name())
2009                    .collect::<BTreeSet<_>>();
2010
2011                // Exclude metric engine internal columns (not PromQL labels) from the implicit
2012                // "without" label set.
2013                all_fields.retain(|col| !is_metric_engine_internal_column(col.as_str()));
2014
2015                // remove "without"-ed fields
2016                // nonexistence label will be ignored
2017                for label in &labels.labels {
2018                    let _ = all_fields.remove(label);
2019                }
2020
2021                // remove time index and value fields
2022                if let Some(time_index) = &self.ctx.time_index_column {
2023                    let _ = all_fields.remove(time_index);
2024                }
2025                for value in &self.ctx.field_columns {
2026                    let _ = all_fields.remove(value);
2027                }
2028
2029                if update_ctx {
2030                    // change the tag columns in context
2031                    self.ctx.tag_columns = all_fields.iter().map(|col| (*col).clone()).collect();
2032                }
2033
2034                // collect remaining fields and convert to col expr
2035                let mut exprs = all_fields
2036                    .into_iter()
2037                    .map(|c| DfExpr::Column(Column::from(c)))
2038                    .collect::<Vec<_>>();
2039
2040                // add timestamp column
2041                exprs.push(self.create_time_index_column_expr()?);
2042
2043                Ok(exprs)
2044            }
2045        }
2046    }
2047
2048    // TODO(ruihang): ignore `MetricNameLabel` (`__name__`) matcher
2049    pub fn matchers_to_expr(
2050        label_matchers: Matchers,
2051        table_schema: &DFSchemaRef,
2052    ) -> Result<Vec<DfExpr>> {
2053        let mut exprs = Vec::with_capacity(label_matchers.matchers.len());
2054        for matcher in label_matchers.matchers {
2055            if matcher.name == SCHEMA_COLUMN_MATCHER
2056                || matcher.name == DB_COLUMN_MATCHER
2057                || matcher.name == FIELD_COLUMN_MATCHER
2058            {
2059                continue;
2060            }
2061
2062            let column_name = Self::find_case_sensitive_column(table_schema, matcher.name.as_str());
2063            let col = if let Some(column_name) = column_name {
2064                DfExpr::Column(Column::from_name(column_name))
2065            } else {
2066                DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None)
2067                    .alias(matcher.name.clone())
2068            };
2069            let lit = DfExpr::Literal(ScalarValue::Utf8(Some(matcher.value)), None);
2070            let expr = match matcher.op {
2071                MatchOp::Equal => col.eq(lit),
2072                MatchOp::NotEqual => col.not_eq(lit),
2073                MatchOp::Re(re) => {
2074                    // TODO(ruihang): a more programmatic way to handle this in datafusion
2075
2076                    // This is a hack to handle `.+` and `.*`, and is not strictly correct
2077                    // `.` doesn't match newline (`\n`). Given this is in PromQL context,
2078                    // most of the time it's fine.
2079                    if re.as_str() == "^(?:.*)$" {
2080                        continue;
2081                    }
2082                    if re.as_str() == "^(?:.+)$" {
2083                        col.not_eq(DfExpr::Literal(
2084                            ScalarValue::Utf8(Some(String::new())),
2085                            None,
2086                        ))
2087                    } else {
2088                        DfExpr::BinaryExpr(BinaryExpr {
2089                            left: Box::new(col),
2090                            op: Operator::RegexMatch,
2091                            right: Box::new(DfExpr::Literal(
2092                                ScalarValue::Utf8(Some(re.as_str().to_string())),
2093                                None,
2094                            )),
2095                        })
2096                    }
2097                }
2098                MatchOp::NotRe(re) => {
2099                    if re.as_str() == "^(?:.*)$" {
2100                        DfExpr::Literal(ScalarValue::Boolean(Some(false)), None)
2101                    } else if re.as_str() == "^(?:.+)$" {
2102                        col.eq(DfExpr::Literal(
2103                            ScalarValue::Utf8(Some(String::new())),
2104                            None,
2105                        ))
2106                    } else {
2107                        DfExpr::BinaryExpr(BinaryExpr {
2108                            left: Box::new(col),
2109                            op: Operator::RegexNotMatch,
2110                            right: Box::new(DfExpr::Literal(
2111                                ScalarValue::Utf8(Some(re.as_str().to_string())),
2112                                None,
2113                            )),
2114                        })
2115                    }
2116                }
2117            };
2118            exprs.push(expr);
2119        }
2120
2121        Ok(exprs)
2122    }
2123
2124    fn find_case_sensitive_column(schema: &DFSchemaRef, column: &str) -> Option<String> {
2125        if is_metric_engine_internal_column(column) {
2126            return None;
2127        }
2128        schema
2129            .fields()
2130            .iter()
2131            .find(|field| field.name() == column)
2132            .map(|field| field.name().clone())
2133    }
2134
2135    fn table_from_source(&self, source: &Arc<dyn TableSource>) -> Result<table::TableRef> {
2136        Ok(source
2137            .as_any()
2138            .downcast_ref::<DefaultTableSource>()
2139            .context(UnknownTableSnafu)?
2140            .table_provider
2141            .as_any()
2142            .downcast_ref::<DfTableProviderAdapter>()
2143            .context(UnknownTableSnafu)?
2144            .table())
2145    }
2146
2147    fn table_ref(&self) -> Result<TableReference> {
2148        let table_name = self
2149            .ctx
2150            .table_name
2151            .clone()
2152            .context(TableNameNotFoundSnafu)?;
2153
2154        // set schema name if `__schema__` is given
2155        let table_ref = if let Some(schema_name) = &self.ctx.schema_name {
2156            TableReference::partial(schema_name.as_str(), table_name.as_str())
2157        } else {
2158            TableReference::bare(table_name.as_str())
2159        };
2160
2161        Ok(table_ref)
2162    }
2163
2164    fn build_time_index_filter(&self, offset_duration: i64) -> Result<Option<DfExpr>> {
2165        let start = self.ctx.start;
2166        let end = self.ctx.end;
2167        if end < start {
2168            return InvalidTimeRangeSnafu { start, end }.fail();
2169        }
2170        let lookback_delta = self.ctx.lookback_delta;
2171        let range = self.ctx.range.unwrap_or_default();
2172        let interval = self.ctx.interval;
2173        let time_index_expr = self.create_time_index_column_expr()?;
2174        let num_points = (end - start) / interval;
2175
2176        // Prometheus semantics:
2177        // - Instant selector lookback: (eval_ts - lookback_delta, eval_ts]
2178        // - Range selector:           (eval_ts - range, eval_ts]
2179        //
2180        // So samples positioned exactly at the lower boundary must be excluded. We align the scan
2181        // lower bound with Prometheus by shifting it forward by 1ms (millisecond granularity),
2182        // while still using a `>=` filter.
2183        let selector_window = if range == 0 { lookback_delta } else { range };
2184        let lower_exclusive_adjustment = if selector_window > 0 { 1 } else { 0 };
2185
2186        // Scan a continuous time range
2187        if (end - start) / interval > MAX_SCATTER_POINTS || interval <= INTERVAL_1H {
2188            let single_time_range = time_index_expr
2189                .clone()
2190                .gt_eq(DfExpr::Literal(
2191                    ScalarValue::TimestampMillisecond(
2192                        Some(
2193                            self.ctx.start - offset_duration - selector_window
2194                                + lower_exclusive_adjustment,
2195                        ),
2196                        None,
2197                    ),
2198                    None,
2199                ))
2200                .and(time_index_expr.lt_eq(DfExpr::Literal(
2201                    ScalarValue::TimestampMillisecond(Some(self.ctx.end - offset_duration), None),
2202                    None,
2203                )));
2204            return Ok(Some(single_time_range));
2205        }
2206
2207        // Otherwise scan scatter ranges separately
2208        let mut filters = Vec::with_capacity(num_points as usize + 1);
2209        for timestamp in (start..=end).step_by(interval as usize) {
2210            filters.push(
2211                time_index_expr
2212                    .clone()
2213                    .gt_eq(DfExpr::Literal(
2214                        ScalarValue::TimestampMillisecond(
2215                            Some(
2216                                timestamp - offset_duration - selector_window
2217                                    + lower_exclusive_adjustment,
2218                            ),
2219                            None,
2220                        ),
2221                        None,
2222                    ))
2223                    .and(time_index_expr.clone().lt_eq(DfExpr::Literal(
2224                        ScalarValue::TimestampMillisecond(Some(timestamp - offset_duration), None),
2225                        None,
2226                    ))),
2227            )
2228        }
2229
2230        Ok(filters.into_iter().reduce(DfExpr::or))
2231    }
2232
    /// Creates the table scan plan for `table_ref`.
    ///
    /// If `table_ref` resolves to a metric engine logical table whose physical table can be
    /// resolved and exposes both the table id column and a `UInt64` tsid column, the physical
    /// table is scanned instead. In that case the scan is restricted to the required columns,
    /// filtered by `__table_id = logical_table_id` and aliased back to `table_ref`.
    /// Otherwise the resolved table is scanned directly without a projection.
    ///
    /// The time index is cast to millisecond precision when the scanned table stores it in
    /// another precision.
    ///
    /// # Side Effects
    ///
    /// Updates `self.ctx.use_tsid` to reflect whether the tsid column is part of the scan.
    ///
    /// # Errors
    ///
    /// Returns an error if resolving a table fails (except for a missing physical table, which
    /// falls back to the logical table), if the time index cannot be found, or if DataFusion
    /// fails to build one of the plan nodes.
    async fn create_table_scan_plan(&mut self, table_ref: TableReference) -> Result<LogicalPlan> {
        let provider = self
            .table_provider
            .resolve_table(table_ref.clone())
            .await
            .context(CatalogSnafu)?;

        let logical_table = self.table_from_source(&provider)?;

        // Try to rewrite the table scan to physical table scan if possible.
        // These three variables keep describing the logical table unless the rewrite applies.
        let mut maybe_phy_table_ref = table_ref.clone();
        let mut scan_provider = provider;
        let mut table_id_filter: Option<u32> = None;

        // If it's a metric engine logical table, scan its physical table directly and filter by
        // `__table_id = logical_table_id` to get access to internal columns like `__tsid`.
        if logical_table.table_info().meta.engine == METRIC_ENGINE_NAME
            && let Some(physical_table_name) = logical_table
                .table_info()
                .meta
                .options
                .extra_options
                .get(LOGICAL_TABLE_METADATA_KEY)
        {
            // Qualify the physical table with the schema from the context when there is one.
            let physical_table_ref = if let Some(schema_name) = &self.ctx.schema_name {
                TableReference::partial(schema_name.as_str(), physical_table_name.as_str())
            } else {
                TableReference::bare(physical_table_name.as_str())
            };

            let physical_provider = match self
                .table_provider
                .resolve_table(physical_table_ref.clone())
                .await
            {
                Ok(provider) => provider,
                Err(e) if e.status_code() == StatusCode::TableNotFound => {
                    // Fall back to scanning the logical table. It still works, but without
                    // `__tsid` optimization.
                    scan_provider.clone()
                }
                Err(e) => return Err(e).context(CatalogSnafu),
            };

            // Pointer equality means the fallback above was taken, so there is nothing to
            // rewrite.
            if !Arc::ptr_eq(&physical_provider, &scan_provider) {
                // Only rewrite when internal columns exist in physical schema.
                let physical_table = self.table_from_source(&physical_provider)?;

                let has_table_id = physical_table
                    .schema()
                    .column_schema_by_name(DATA_SCHEMA_TABLE_ID_COLUMN_NAME)
                    .is_some();
                let has_tsid = physical_table
                    .schema()
                    .column_schema_by_name(DATA_SCHEMA_TSID_COLUMN_NAME)
                    .is_some_and(|col| matches!(col.data_type, ConcreteDataType::UInt64(_)));

                if has_table_id && has_tsid {
                    scan_provider = physical_provider;
                    maybe_phy_table_ref = physical_table_ref;
                    table_id_filter = Some(logical_table.table_info().ident.table_id);
                }
            }
        }

        let scan_table = self.table_from_source(&scan_provider)?;

        // The tsid column is only used when the physical table is scanned and the column has
        // the expected `UInt64` type.
        let use_tsid = table_id_filter.is_some()
            && scan_table
                .schema()
                .column_schema_by_name(DATA_SCHEMA_TSID_COLUMN_NAME)
                .is_some_and(|col| matches!(col.data_type, ConcreteDataType::UInt64(_)));
        self.ctx.use_tsid = use_tsid;

        let all_table_tags = self.ctx.tag_columns.clone();

        // Tag columns to read from the scan. With tsid enabled the list is sorted and
        // deduplicated; otherwise the context's tag columns are used as is.
        let scan_tag_columns = if use_tsid {
            let mut scan_tags = self.ctx.tag_columns.clone();
            for matcher in &self.ctx.selector_matcher {
                if is_metric_engine_internal_column(&matcher.name) {
                    continue;
                }
                if all_table_tags.iter().any(|tag| tag == &matcher.name) {
                    scan_tags.push(matcher.name.clone());
                }
            }
            scan_tags.sort_unstable();
            scan_tags.dedup();
            scan_tags
        } else {
            self.ctx.tag_columns.clone()
        };

        let is_time_index_ms = scan_table
            .schema()
            .timestamp_column()
            .with_context(|| TimeIndexNotFoundSnafu {
                table: maybe_phy_table_ref.to_quoted_string(),
            })?
            .data_type
            == ConcreteDataType::timestamp_millisecond_datatype();

        // When scanning the physical table, only read the columns this query needs:
        // the table id, the time index, the tags, the fields and (optionally) the tsid.
        let scan_projection = if table_id_filter.is_some() {
            let mut required_columns = HashSet::new();
            required_columns.insert(DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string());
            required_columns.insert(self.ctx.time_index_column.clone().with_context(|| {
                TimeIndexNotFoundSnafu {
                    table: maybe_phy_table_ref.to_quoted_string(),
                }
            })?);
            for col in &scan_tag_columns {
                required_columns.insert(col.clone());
            }
            for col in &self.ctx.field_columns {
                required_columns.insert(col.clone());
            }
            if use_tsid {
                required_columns.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string());
            }

            // Translate the required column names into indices of the scanned table's schema.
            let arrow_schema = scan_table.schema().arrow_schema().clone();
            Some(
                arrow_schema
                    .fields()
                    .iter()
                    .enumerate()
                    .filter(|(_, field)| required_columns.contains(field.name().as_str()))
                    .map(|(idx, _)| idx)
                    .collect::<Vec<_>>(),
            )
        } else {
            None
        };

        let mut scan_plan =
            LogicalPlanBuilder::scan(maybe_phy_table_ref.clone(), scan_provider, scan_projection)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;

        // Keep only the rows that belong to the logical table.
        if let Some(table_id) = table_id_filter {
            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .filter(
                    DfExpr::Column(Column::from_name(DATA_SCHEMA_TABLE_ID_COLUMN_NAME))
                        .eq(lit(table_id)),
                )
                .context(DataFusionPlanningSnafu)?
                .alias(table_ref.clone()) // rename the relation back to logical table's name after filtering
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        }

        if !is_time_index_ms {
            // cast to ms if time_index not in Millisecond precision
            // The projection lists fields, tags, the optional tsid and the casted time index,
            // so the table id column is not carried over by this branch either.
            let expr: Vec<_> = self
                .create_field_column_exprs()?
                .into_iter()
                .chain(
                    scan_tag_columns
                        .iter()
                        .map(|tag| DfExpr::Column(Column::from_name(tag))),
                )
                .chain(self.ctx.use_tsid.then_some(DfExpr::Column(Column::new(
                    Some(table_ref.clone()),
                    DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
                ))))
                .chain(Some(DfExpr::Alias(Alias {
                    expr: Box::new(DfExpr::Cast(Cast {
                        expr: Box::new(self.create_time_index_column_expr()?),
                        data_type: ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None),
                    })),
                    relation: Some(table_ref.clone()),
                    name: self
                        .ctx
                        .time_index_column
                        .as_ref()
                        .with_context(|| TimeIndexNotFoundSnafu {
                            table: table_ref.to_quoted_string(),
                        })?
                        .clone(),
                    metadata: None,
                })))
                .collect::<Vec<_>>();
            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .project(expr)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        } else if table_id_filter.is_some() {
            // Drop the internal `__table_id` column after filtering.
            let project_exprs = self
                .create_field_column_exprs()?
                .into_iter()
                .chain(
                    scan_tag_columns
                        .iter()
                        .map(|tag| DfExpr::Column(Column::from_name(tag))),
                )
                .chain(
                    self.ctx
                        .use_tsid
                        .then_some(DfExpr::Column(Column::from_name(
                            DATA_SCHEMA_TSID_COLUMN_NAME,
                        ))),
                )
                .chain(Some(self.create_time_index_column_expr()?))
                .collect::<Vec<_>>();

            scan_plan = LogicalPlanBuilder::from(scan_plan)
                .project(project_exprs)
                .context(DataFusionPlanningSnafu)?
                .build()
                .context(DataFusionPlanningSnafu)?;
        }

        let result = LogicalPlanBuilder::from(scan_plan)
            .build()
            .context(DataFusionPlanningSnafu)?;
        Ok(result)
    }
2458
2459    fn collect_row_key_tag_columns_from_plan(
2460        &self,
2461        plan: &LogicalPlan,
2462    ) -> Result<BTreeSet<String>> {
2463        fn walk(
2464            planner: &PromPlanner,
2465            plan: &LogicalPlan,
2466            out: &mut BTreeSet<String>,
2467        ) -> Result<()> {
2468            if let LogicalPlan::TableScan(scan) = plan {
2469                let table = planner.table_from_source(&scan.source)?;
2470                for col in table.table_info().meta.row_key_column_names() {
2471                    if col != DATA_SCHEMA_TABLE_ID_COLUMN_NAME
2472                        && col != DATA_SCHEMA_TSID_COLUMN_NAME
2473                        && !is_metric_engine_internal_column(col)
2474                    {
2475                        out.insert(col.clone());
2476                    }
2477                }
2478            }
2479
2480            for input in plan.inputs() {
2481                walk(planner, input, out)?;
2482            }
2483            Ok(())
2484        }
2485
2486        let mut out = BTreeSet::new();
2487        walk(self, plan, &mut out)?;
2488        Ok(out)
2489    }
2490
2491    fn ensure_tag_columns_available(
2492        &self,
2493        plan: LogicalPlan,
2494        required_tags: &BTreeSet<String>,
2495    ) -> Result<LogicalPlan> {
2496        if required_tags.is_empty() {
2497            return Ok(plan);
2498        }
2499
2500        struct Rewriter {
2501            required_tags: BTreeSet<String>,
2502        }
2503
2504        impl TreeNodeRewriter for Rewriter {
2505            type Node = LogicalPlan;
2506
2507            fn f_up(
2508                &mut self,
2509                node: Self::Node,
2510            ) -> datafusion_common::Result<Transformed<Self::Node>> {
2511                match node {
2512                    LogicalPlan::TableScan(scan) => {
2513                        let schema = scan.source.schema();
2514                        let mut projection = match scan.projection.clone() {
2515                            Some(p) => p,
2516                            None => {
2517                                // Scanning all columns already covers required tags.
2518                                return Ok(Transformed::no(LogicalPlan::TableScan(scan)));
2519                            }
2520                        };
2521
2522                        let mut changed = false;
2523                        for tag in &self.required_tags {
2524                            if let Some((idx, _)) = schema
2525                                .fields()
2526                                .iter()
2527                                .enumerate()
2528                                .find(|(_, field)| field.name() == tag)
2529                                && !projection.contains(&idx)
2530                            {
2531                                projection.push(idx);
2532                                changed = true;
2533                            }
2534                        }
2535
2536                        if !changed {
2537                            return Ok(Transformed::no(LogicalPlan::TableScan(scan)));
2538                        }
2539
2540                        projection.sort_unstable();
2541                        projection.dedup();
2542
2543                        let new_scan = TableScan::try_new(
2544                            scan.table_name.clone(),
2545                            scan.source.clone(),
2546                            Some(projection),
2547                            scan.filters,
2548                            scan.fetch,
2549                        )?;
2550                        Ok(Transformed::yes(LogicalPlan::TableScan(new_scan)))
2551                    }
2552                    LogicalPlan::Projection(proj) => {
2553                        let input_schema = proj.input.schema();
2554
2555                        let existing = proj
2556                            .schema
2557                            .fields()
2558                            .iter()
2559                            .map(|f| f.name().as_str())
2560                            .collect::<HashSet<_>>();
2561
2562                        let mut expr = proj.expr.clone();
2563                        let mut has_changed = false;
2564                        for tag in &self.required_tags {
2565                            if existing.contains(tag.as_str()) {
2566                                continue;
2567                            }
2568
2569                            if let Some(idx) = input_schema.index_of_column_by_name(None, tag) {
2570                                expr.push(DfExpr::Column(Column::from(
2571                                    input_schema.qualified_field(idx),
2572                                )));
2573                                has_changed = true;
2574                            }
2575                        }
2576
2577                        if !has_changed {
2578                            return Ok(Transformed::no(LogicalPlan::Projection(proj)));
2579                        }
2580
2581                        let new_proj = Projection::try_new(expr, proj.input)?;
2582                        Ok(Transformed::yes(LogicalPlan::Projection(new_proj)))
2583                    }
2584                    other => Ok(Transformed::no(other)),
2585                }
2586            }
2587        }
2588
2589        let mut rewriter = Rewriter {
2590            required_tags: required_tags.clone(),
2591        };
2592        let rewritten = plan
2593            .rewrite(&mut rewriter)
2594            .context(DataFusionPlanningSnafu)?;
2595        Ok(rewritten.data)
2596    }
2597
2598    fn refresh_tag_columns_from_schema(&mut self, schema: &DFSchemaRef) {
2599        let time_index = self.ctx.time_index_column.as_deref();
2600        let field_columns = self.ctx.field_columns.iter().collect::<HashSet<_>>();
2601
2602        let mut tags = schema
2603            .fields()
2604            .iter()
2605            .map(|f| f.name())
2606            .filter(|name| Some(name.as_str()) != time_index)
2607            .filter(|name| !field_columns.contains(name))
2608            .filter(|name| !is_metric_engine_internal_column(name))
2609            .cloned()
2610            .collect::<Vec<_>>();
2611        tags.sort_unstable();
2612        tags.dedup();
2613        self.ctx.tag_columns = tags;
2614    }
2615
2616    /// Setup [PromPlannerContext]'s state fields.
2617    ///
2618    /// Returns a logical plan for an empty metric.
2619    async fn setup_context(&mut self) -> Result<Option<LogicalPlan>> {
2620        let table_ref = self.table_ref()?;
2621        let source = match self.table_provider.resolve_table(table_ref.clone()).await {
2622            Err(e) if e.status_code() == StatusCode::TableNotFound => {
2623                let plan = self.setup_context_for_empty_metric()?;
2624                return Ok(Some(plan));
2625            }
2626            res => res.context(CatalogSnafu)?,
2627        };
2628        let table = self.table_from_source(&source)?;
2629
2630        // set time index column name
2631        let time_index = table
2632            .schema()
2633            .timestamp_column()
2634            .with_context(|| TimeIndexNotFoundSnafu {
2635                table: table_ref.to_quoted_string(),
2636            })?
2637            .name
2638            .clone();
2639        self.ctx.time_index_column = Some(time_index);
2640
2641        // set values columns
2642        let values = table
2643            .table_info()
2644            .meta
2645            .field_column_names()
2646            .cloned()
2647            .collect();
2648        self.ctx.field_columns = values;
2649
2650        // set primary key (tag) columns
2651        let tags = table
2652            .table_info()
2653            .meta
2654            .row_key_column_names()
2655            .filter(|col| {
2656                // remove metric engine's internal columns
2657                col != &DATA_SCHEMA_TABLE_ID_COLUMN_NAME && col != &DATA_SCHEMA_TSID_COLUMN_NAME
2658            })
2659            .cloned()
2660            .collect();
2661        self.ctx.tag_columns = tags;
2662
2663        self.ctx.use_tsid = false;
2664
2665        Ok(None)
2666    }
2667
2668    /// Setup [PromPlannerContext]'s state fields for a non existent table
2669    /// without any rows.
2670    fn setup_context_for_empty_metric(&mut self) -> Result<LogicalPlan> {
2671        self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
2672        self.ctx.reset_table_name_and_schema();
2673        self.ctx.tag_columns = vec![];
2674        self.ctx.field_columns = vec![DEFAULT_FIELD_COLUMN.to_string()];
2675        self.ctx.use_tsid = false;
2676
2677        // The table doesn't have any data, so we set start to 0 and end to -1.
2678        let plan = LogicalPlan::Extension(Extension {
2679            node: Arc::new(
2680                EmptyMetric::new(
2681                    0,
2682                    -1,
2683                    self.ctx.interval,
2684                    SPECIAL_TIME_FUNCTION.to_string(),
2685                    DEFAULT_FIELD_COLUMN.to_string(),
2686                    Some(lit(0.0f64)),
2687                )
2688                .context(DataFusionPlanningSnafu)?,
2689            ),
2690        });
2691        Ok(plan)
2692    }
2693
2694    // TODO(ruihang): insert column expr
2695    fn create_function_args(&self, args: &[Box<PromExpr>]) -> Result<FunctionArgs> {
2696        let mut result = FunctionArgs::default();
2697
2698        for arg in args {
2699            // First try to parse as literal expression (including binary expressions like 100.0 + 3.0)
2700            if let Some(expr) = Self::try_build_literal_expr(arg) {
2701                result.literals.push(expr);
2702            } else {
2703                // If not a literal, treat as vector input
2704                match arg.as_ref() {
2705                    PromExpr::Subquery(_)
2706                    | PromExpr::VectorSelector(_)
2707                    | PromExpr::MatrixSelector(_)
2708                    | PromExpr::Extension(_)
2709                    | PromExpr::Aggregate(_)
2710                    | PromExpr::Paren(_)
2711                    | PromExpr::Call(_)
2712                    | PromExpr::Binary(_)
2713                    | PromExpr::Unary(_) => {
2714                        if result.input.replace(*arg.clone()).is_some() {
2715                            MultipleVectorSnafu { expr: *arg.clone() }.fail()?;
2716                        }
2717                    }
2718
2719                    _ => {
2720                        let expr = Self::get_param_as_literal_expr(&Some(arg.clone()), None, None)?;
2721                        result.literals.push(expr);
2722                    }
2723                }
2724            }
2725        }
2726
2727        Ok(result)
2728    }
2729
2730    /// Creates function expressions for projection and returns the expressions and new tags.
2731    ///
2732    /// # Side Effects
2733    ///
2734    /// This method will update [PromPlannerContext]'s fields and tags if needed.
2735    fn create_function_expr(
2736        &mut self,
2737        func: &Function,
2738        other_input_exprs: Vec<DfExpr>,
2739        query_engine_state: &QueryEngineState,
2740    ) -> Result<(Vec<DfExpr>, Vec<String>)> {
2741        // TODO(ruihang): check function args list
2742        let mut other_input_exprs: VecDeque<DfExpr> = other_input_exprs.into();
2743
2744        // TODO(ruihang): set this according to in-param list
2745        let field_column_pos = 0;
2746        let mut exprs = Vec::with_capacity(self.ctx.field_columns.len());
2747        // New labels after executing the function, e.g. `label_replace` etc.
2748        let mut new_tags = vec![];
2749        let scalar_func = match func.name {
2750            "increase" => ScalarFunc::ExtrapolateUdf(
2751                Arc::new(Increase::scalar_udf()),
2752                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2753            ),
2754            "rate" => ScalarFunc::ExtrapolateUdf(
2755                Arc::new(Rate::scalar_udf()),
2756                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2757            ),
2758            "delta" => ScalarFunc::ExtrapolateUdf(
2759                Arc::new(Delta::scalar_udf()),
2760                self.ctx.range.context(ExpectRangeSelectorSnafu)?,
2761            ),
2762            "idelta" => ScalarFunc::Udf(Arc::new(IDelta::<false>::scalar_udf())),
2763            "irate" => ScalarFunc::Udf(Arc::new(IDelta::<true>::scalar_udf())),
2764            "resets" => ScalarFunc::Udf(Arc::new(Resets::scalar_udf())),
2765            "changes" => ScalarFunc::Udf(Arc::new(Changes::scalar_udf())),
2766            "deriv" => ScalarFunc::Udf(Arc::new(Deriv::scalar_udf())),
2767            "avg_over_time" => ScalarFunc::Udf(Arc::new(AvgOverTime::scalar_udf())),
2768            "min_over_time" => ScalarFunc::Udf(Arc::new(MinOverTime::scalar_udf())),
2769            "max_over_time" => ScalarFunc::Udf(Arc::new(MaxOverTime::scalar_udf())),
2770            "sum_over_time" => ScalarFunc::Udf(Arc::new(SumOverTime::scalar_udf())),
2771            "count_over_time" => ScalarFunc::Udf(Arc::new(CountOverTime::scalar_udf())),
2772            "last_over_time" => ScalarFunc::Udf(Arc::new(LastOverTime::scalar_udf())),
2773            "absent_over_time" => ScalarFunc::Udf(Arc::new(AbsentOverTime::scalar_udf())),
2774            "present_over_time" => ScalarFunc::Udf(Arc::new(PresentOverTime::scalar_udf())),
2775            "stddev_over_time" => ScalarFunc::Udf(Arc::new(StddevOverTime::scalar_udf())),
2776            "stdvar_over_time" => ScalarFunc::Udf(Arc::new(StdvarOverTime::scalar_udf())),
2777            "quantile_over_time" => ScalarFunc::Udf(Arc::new(QuantileOverTime::scalar_udf())),
2778            "predict_linear" => {
2779                other_input_exprs[0] = DfExpr::Cast(Cast {
2780                    expr: Box::new(other_input_exprs[0].clone()),
2781                    data_type: ArrowDataType::Int64,
2782                });
2783                ScalarFunc::Udf(Arc::new(PredictLinear::scalar_udf()))
2784            }
2785            "double_exponential_smoothing" | "holt_winters" => {
2786                ScalarFunc::Udf(Arc::new(DoubleExponentialSmoothing::scalar_udf()))
2787            }
2788            "time" => {
2789                exprs.push(build_special_time_expr(
2790                    self.ctx.time_index_column.as_ref().unwrap(),
2791                ));
2792                ScalarFunc::GeneratedExpr
2793            }
2794            "minute" => {
2795                // date_part('minute', time_index)
2796                let expr = self.date_part_on_time_index("minute")?;
2797                exprs.push(expr);
2798                ScalarFunc::GeneratedExpr
2799            }
2800            "hour" => {
2801                // date_part('hour', time_index)
2802                let expr = self.date_part_on_time_index("hour")?;
2803                exprs.push(expr);
2804                ScalarFunc::GeneratedExpr
2805            }
2806            "month" => {
2807                // date_part('month', time_index)
2808                let expr = self.date_part_on_time_index("month")?;
2809                exprs.push(expr);
2810                ScalarFunc::GeneratedExpr
2811            }
2812            "year" => {
2813                // date_part('year', time_index)
2814                let expr = self.date_part_on_time_index("year")?;
2815                exprs.push(expr);
2816                ScalarFunc::GeneratedExpr
2817            }
2818            "day_of_month" => {
2819                // date_part('day', time_index)
2820                let expr = self.date_part_on_time_index("day")?;
2821                exprs.push(expr);
2822                ScalarFunc::GeneratedExpr
2823            }
2824            "day_of_week" => {
2825                // date_part('dow', time_index)
2826                let expr = self.date_part_on_time_index("dow")?;
2827                exprs.push(expr);
2828                ScalarFunc::GeneratedExpr
2829            }
2830            "day_of_year" => {
2831                // date_part('doy', time_index)
2832                let expr = self.date_part_on_time_index("doy")?;
2833                exprs.push(expr);
2834                ScalarFunc::GeneratedExpr
2835            }
2836            "days_in_month" => {
2837                // date_part(
2838                //     'days',
2839                //     (date_trunc('month', <TIME INDEX>::date) + interval '1 month - 1 day')
2840                // );
2841                let day_lit_expr = "day".lit();
2842                let month_lit_expr = "month".lit();
2843                let interval_1month_lit_expr =
2844                    DfExpr::Literal(ScalarValue::IntervalYearMonth(Some(1)), None);
2845                let interval_1day_lit_expr = DfExpr::Literal(
2846                    ScalarValue::IntervalDayTime(Some(IntervalDayTime::new(1, 0))),
2847                    None,
2848                );
2849                let the_1month_minus_1day_expr = DfExpr::BinaryExpr(BinaryExpr {
2850                    left: Box::new(interval_1month_lit_expr),
2851                    op: Operator::Minus,
2852                    right: Box::new(interval_1day_lit_expr),
2853                });
2854                let date_trunc_expr = DfExpr::ScalarFunction(ScalarFunction {
2855                    func: datafusion_functions::datetime::date_trunc(),
2856                    args: vec![month_lit_expr, self.create_time_index_column_expr()?],
2857                });
2858                let date_trunc_plus_interval_expr = DfExpr::BinaryExpr(BinaryExpr {
2859                    left: Box::new(date_trunc_expr),
2860                    op: Operator::Plus,
2861                    right: Box::new(the_1month_minus_1day_expr),
2862                });
2863                let date_part_expr = DfExpr::ScalarFunction(ScalarFunction {
2864                    func: datafusion_functions::datetime::date_part(),
2865                    args: vec![day_lit_expr, date_trunc_plus_interval_expr],
2866                });
2867
2868                exprs.push(date_part_expr);
2869                ScalarFunc::GeneratedExpr
2870            }
2871
2872            "label_join" => {
2873                self.ctx.use_tsid = false;
2874                let (concat_expr, dst_label) = Self::build_concat_labels_expr(
2875                    &mut other_input_exprs,
2876                    &self.ctx,
2877                    query_engine_state,
2878                )?;
2879
2880                // Reserve the current field columns except the `dst_label`.
2881                for value in &self.ctx.field_columns {
2882                    if *value != dst_label {
2883                        let expr = DfExpr::Column(Column::from_name(value));
2884                        exprs.push(expr);
2885                    }
2886                }
2887
2888                // Remove it from tag columns if exists to avoid duplicated column names
2889                self.ctx.tag_columns.retain(|tag| *tag != dst_label);
2890                new_tags.push(dst_label);
2891                // Add the new label expr to evaluate
2892                exprs.push(concat_expr);
2893
2894                ScalarFunc::GeneratedExpr
2895            }
2896            "label_replace" => {
2897                self.ctx.use_tsid = false;
2898                if let Some((replace_expr, dst_label)) = self
2899                    .build_regexp_replace_label_expr(&mut other_input_exprs, query_engine_state)?
2900                {
2901                    // Reserve the current field columns except the `dst_label`.
2902                    for value in &self.ctx.field_columns {
2903                        if *value != dst_label {
2904                            let expr = DfExpr::Column(Column::from_name(value));
2905                            exprs.push(expr);
2906                        }
2907                    }
2908
2909                    ensure!(
2910                        !self.ctx.tag_columns.contains(&dst_label),
2911                        SameLabelSetSnafu
2912                    );
2913                    new_tags.push(dst_label);
2914                    // Add the new label expr to evaluate
2915                    exprs.push(replace_expr);
2916                } else {
2917                    // Keep the current field columns
2918                    for value in &self.ctx.field_columns {
2919                        let expr = DfExpr::Column(Column::from_name(value));
2920                        exprs.push(expr);
2921                    }
2922                }
2923
2924                ScalarFunc::GeneratedExpr
2925            }
2926            "sort" | "sort_desc" | "sort_by_label" | "sort_by_label_desc" | "timestamp" => {
2927                // These functions are not expression but a part of plan,
2928                // they are processed by `prom_call_expr_to_plan`.
2929                for value in &self.ctx.field_columns {
2930                    let expr = DfExpr::Column(Column::from_name(value));
2931                    exprs.push(expr);
2932                }
2933
2934                ScalarFunc::GeneratedExpr
2935            }
2936            "round" => {
2937                if other_input_exprs.is_empty() {
2938                    other_input_exprs.push_front(0.0f64.lit());
2939                }
2940                ScalarFunc::DataFusionUdf(Arc::new(Round::scalar_udf()))
2941            }
2942            "rad" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::radians()),
2943            "deg" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::degrees()),
2944            "sgn" => ScalarFunc::DataFusionBuiltin(datafusion::functions::math::signum()),
2945            "pi" => {
2946                // pi functions doesn't accepts any arguments, needs special processing
2947                let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2948                    func: datafusion::functions::math::pi(),
2949                    args: vec![],
2950                });
2951                exprs.push(fn_expr);
2952
2953                ScalarFunc::GeneratedExpr
2954            }
2955            _ => {
2956                if let Some(f) = query_engine_state
2957                    .session_state()
2958                    .scalar_functions()
2959                    .get(func.name)
2960                {
2961                    ScalarFunc::DataFusionBuiltin(f.clone())
2962                } else if let Some(factory) = query_engine_state.scalar_function(func.name) {
2963                    let func_state = query_engine_state.function_state();
2964                    let query_ctx = self.table_provider.query_ctx();
2965
2966                    ScalarFunc::DataFusionUdf(Arc::new(factory.provide(FunctionContext {
2967                        state: func_state,
2968                        query_ctx: query_ctx.clone(),
2969                    })))
2970                } else if let Some(f) = datafusion_functions::math::functions()
2971                    .iter()
2972                    .find(|f| f.name() == func.name)
2973                {
2974                    ScalarFunc::DataFusionUdf(f.clone())
2975                } else {
2976                    return UnsupportedExprSnafu {
2977                        name: func.name.to_string(),
2978                    }
2979                    .fail();
2980                }
2981            }
2982        };
2983
2984        for value in &self.ctx.field_columns {
2985            let col_expr = DfExpr::Column(Column::from_name(value));
2986
2987            match scalar_func.clone() {
2988                ScalarFunc::DataFusionBuiltin(func) => {
2989                    other_input_exprs.insert(field_column_pos, col_expr);
2990                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
2991                        func,
2992                        args: other_input_exprs.clone().into(),
2993                    });
2994                    exprs.push(fn_expr);
2995                    let _ = other_input_exprs.remove(field_column_pos);
2996                }
2997                ScalarFunc::DataFusionUdf(func) => {
2998                    let args = itertools::chain!(
2999                        other_input_exprs.iter().take(field_column_pos).cloned(),
3000                        std::iter::once(col_expr),
3001                        other_input_exprs.iter().skip(field_column_pos).cloned()
3002                    )
3003                    .collect_vec();
3004                    exprs.push(DfExpr::ScalarFunction(ScalarFunction { func, args }))
3005                }
3006                ScalarFunc::Udf(func) => {
3007                    let ts_range_expr = DfExpr::Column(Column::from_name(
3008                        RangeManipulate::build_timestamp_range_name(
3009                            self.ctx.time_index_column.as_ref().unwrap(),
3010                        ),
3011                    ));
3012                    other_input_exprs.insert(field_column_pos, ts_range_expr);
3013                    other_input_exprs.insert(field_column_pos + 1, col_expr);
3014                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
3015                        func,
3016                        args: other_input_exprs.clone().into(),
3017                    });
3018                    exprs.push(fn_expr);
3019                    let _ = other_input_exprs.remove(field_column_pos + 1);
3020                    let _ = other_input_exprs.remove(field_column_pos);
3021                }
3022                ScalarFunc::ExtrapolateUdf(func, range_length) => {
3023                    let ts_range_expr = DfExpr::Column(Column::from_name(
3024                        RangeManipulate::build_timestamp_range_name(
3025                            self.ctx.time_index_column.as_ref().unwrap(),
3026                        ),
3027                    ));
3028                    other_input_exprs.insert(field_column_pos, ts_range_expr);
3029                    other_input_exprs.insert(field_column_pos + 1, col_expr);
3030                    other_input_exprs
3031                        .insert(field_column_pos + 2, self.create_time_index_column_expr()?);
3032                    other_input_exprs.push_back(lit(range_length));
3033                    let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
3034                        func,
3035                        args: other_input_exprs.clone().into(),
3036                    });
3037                    exprs.push(fn_expr);
3038                    let _ = other_input_exprs.pop_back();
3039                    let _ = other_input_exprs.remove(field_column_pos + 2);
3040                    let _ = other_input_exprs.remove(field_column_pos + 1);
3041                    let _ = other_input_exprs.remove(field_column_pos);
3042                }
3043                ScalarFunc::GeneratedExpr => {}
3044            }
3045        }
3046
3047        // Update value columns' name, and alias them to remove qualifiers
3048        // For label functions such as `label_join`, `label_replace`, etc.,
3049        // we keep the fields unchanged.
3050        if !matches!(func.name, "label_join" | "label_replace") {
3051            let mut new_field_columns = Vec::with_capacity(exprs.len());
3052
3053            exprs = exprs
3054                .into_iter()
3055                .map(|expr| {
3056                    let display_name = expr.schema_name().to_string();
3057                    new_field_columns.push(display_name.clone());
3058                    Ok(expr.alias(display_name))
3059                })
3060                .collect::<std::result::Result<Vec<_>, _>>()
3061                .context(DataFusionPlanningSnafu)?;
3062
3063            self.ctx.field_columns = new_field_columns;
3064        }
3065
3066        Ok((exprs, new_tags))
3067    }
3068
3069    /// Validate label name according to Prometheus specification.
3070    /// Label names must match the regex: [a-zA-Z_][a-zA-Z0-9_]*
3071    /// Additionally, label names starting with double underscores are reserved for internal use.
3072    fn validate_label_name(label_name: &str) -> Result<()> {
3073        // Check if label name starts with double underscores (reserved)
3074        if label_name.starts_with("__") {
3075            return InvalidDestinationLabelNameSnafu { label_name }.fail();
3076        }
3077        // Check if label name matches the required pattern
3078        if !LABEL_NAME_REGEX.is_match(label_name) {
3079            return InvalidDestinationLabelNameSnafu { label_name }.fail();
3080        }
3081
3082        Ok(())
3083    }
3084
3085    /// Build expr for `label_replace` function
3086    fn build_regexp_replace_label_expr(
3087        &self,
3088        other_input_exprs: &mut VecDeque<DfExpr>,
3089        query_engine_state: &QueryEngineState,
3090    ) -> Result<Option<(DfExpr, String)>> {
3091        // label_replace(vector, dst_label, replacement, src_label, regex)
3092        let dst_label = match other_input_exprs.pop_front() {
3093            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
3094            other => UnexpectedPlanExprSnafu {
3095                desc: format!("expected dst_label string literal, but found {:?}", other),
3096            }
3097            .fail()?,
3098        };
3099
3100        // Validate the destination label name
3101        Self::validate_label_name(&dst_label)?;
3102        let replacement = match other_input_exprs.pop_front() {
3103            Some(DfExpr::Literal(ScalarValue::Utf8(Some(r)), _)) => r,
3104            other => UnexpectedPlanExprSnafu {
3105                desc: format!("expected replacement string literal, but found {:?}", other),
3106            }
3107            .fail()?,
3108        };
3109        let src_label = match other_input_exprs.pop_front() {
3110            Some(DfExpr::Literal(ScalarValue::Utf8(Some(s)), None)) => s,
3111            other => UnexpectedPlanExprSnafu {
3112                desc: format!("expected src_label string literal, but found {:?}", other),
3113            }
3114            .fail()?,
3115        };
3116
3117        let regex = match other_input_exprs.pop_front() {
3118            Some(DfExpr::Literal(ScalarValue::Utf8(Some(r)), None)) => r,
3119            other => UnexpectedPlanExprSnafu {
3120                desc: format!("expected regex string literal, but found {:?}", other),
3121            }
3122            .fail()?,
3123        };
3124
3125        // Validate the regex before using it
3126        // doc: https://prometheus.io/docs/prometheus/latest/querying/functions/#label_replace
3127        regex::Regex::new(&regex).map_err(|_| {
3128            InvalidRegularExpressionSnafu {
3129                regex: regex.clone(),
3130            }
3131            .build()
3132        })?;
3133
3134        // If the src_label exists and regex is empty, keep everything unchanged.
3135        if self.ctx.tag_columns.contains(&src_label) && regex.is_empty() {
3136            return Ok(None);
3137        }
3138
3139        // If the src_label doesn't exists, and
3140        if !self.ctx.tag_columns.contains(&src_label) {
3141            if replacement.is_empty() {
3142                // the replacement is empty, keep everything unchanged.
3143                return Ok(None);
3144            } else {
3145                // the replacement is not empty, always adds dst_label with replacement value.
3146                return Ok(Some((
3147                    // alias literal `replacement` as dst_label
3148                    lit(replacement).alias(&dst_label),
3149                    dst_label,
3150                )));
3151            }
3152        }
3153
3154        // Preprocess the regex:
3155        // https://github.com/prometheus/prometheus/blob/d902abc50d6652ba8fe9a81ff8e5cce936114eba/promql/functions.go#L1575C32-L1575C37
3156        let regex = format!("^(?s:{regex})$");
3157
3158        let session_state = query_engine_state.session_state();
3159        let func = session_state
3160            .scalar_functions()
3161            .get("regexp_replace")
3162            .context(UnsupportedExprSnafu {
3163                name: "regexp_replace",
3164            })?;
3165
3166        // regexp_replace(src_label, regex, replacement)
3167        let args = vec![
3168            if src_label.is_empty() {
3169                DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None)
3170            } else {
3171                DfExpr::Column(Column::from_name(src_label))
3172            },
3173            DfExpr::Literal(ScalarValue::Utf8(Some(regex)), None),
3174            DfExpr::Literal(ScalarValue::Utf8(Some(replacement)), None),
3175        ];
3176
3177        Ok(Some((
3178            DfExpr::ScalarFunction(ScalarFunction {
3179                func: func.clone(),
3180                args,
3181            })
3182            .alias(&dst_label),
3183            dst_label,
3184        )))
3185    }
3186
3187    /// Build expr for `label_join` function
3188    fn build_concat_labels_expr(
3189        other_input_exprs: &mut VecDeque<DfExpr>,
3190        ctx: &PromPlannerContext,
3191        query_engine_state: &QueryEngineState,
3192    ) -> Result<(DfExpr, String)> {
3193        // label_join(vector, dst_label, separator, src_label_1, src_label_2, ...)
3194
3195        let dst_label = match other_input_exprs.pop_front() {
3196            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
3197            other => UnexpectedPlanExprSnafu {
3198                desc: format!("expected dst_label string literal, but found {:?}", other),
3199            }
3200            .fail()?,
3201        };
3202        let separator = match other_input_exprs.pop_front() {
3203            Some(DfExpr::Literal(ScalarValue::Utf8(Some(d)), _)) => d,
3204            other => UnexpectedPlanExprSnafu {
3205                desc: format!("expected separator string literal, but found {:?}", other),
3206            }
3207            .fail()?,
3208        };
3209
3210        // Create a set of available columns (tag columns + field columns + time index column)
3211        let available_columns: HashSet<&str> = ctx
3212            .tag_columns
3213            .iter()
3214            .chain(ctx.field_columns.iter())
3215            .chain(ctx.time_index_column.as_ref())
3216            .map(|s| s.as_str())
3217            .collect();
3218
3219        let src_labels = other_input_exprs
3220            .iter()
3221            .map(|expr| {
3222                // Cast source label into column or null literal
3223                match expr {
3224                    DfExpr::Literal(ScalarValue::Utf8(Some(label)), None) => {
3225                        if label.is_empty() {
3226                            Ok(DfExpr::Literal(ScalarValue::Null, None))
3227                        } else if available_columns.contains(label.as_str()) {
3228                            // Label exists in the table schema
3229                            Ok(DfExpr::Column(Column::from_name(label)))
3230                        } else {
3231                            // Label doesn't exist, treat as empty string (null)
3232                            Ok(DfExpr::Literal(ScalarValue::Null, None))
3233                        }
3234                    }
3235                    other => UnexpectedPlanExprSnafu {
3236                        desc: format!(
3237                            "expected source label string literal, but found {:?}",
3238                            other
3239                        ),
3240                    }
3241                    .fail(),
3242                }
3243            })
3244            .collect::<Result<Vec<_>>>()?;
3245        ensure!(
3246            !src_labels.is_empty(),
3247            FunctionInvalidArgumentSnafu {
3248                fn_name: "label_join"
3249            }
3250        );
3251
3252        let session_state = query_engine_state.session_state();
3253        let func = session_state
3254            .scalar_functions()
3255            .get("concat_ws")
3256            .context(UnsupportedExprSnafu { name: "concat_ws" })?;
3257
3258        // concat_ws(separator, src_label_1, src_label_2, ...) as dst_label
3259        let mut args = Vec::with_capacity(1 + src_labels.len());
3260        args.push(DfExpr::Literal(ScalarValue::Utf8(Some(separator)), None));
3261        args.extend(src_labels);
3262
3263        Ok((
3264            DfExpr::ScalarFunction(ScalarFunction {
3265                func: func.clone(),
3266                args,
3267            })
3268            .alias(&dst_label),
3269            dst_label,
3270        ))
3271    }
3272
3273    fn create_time_index_column_expr(&self) -> Result<DfExpr> {
3274        Ok(DfExpr::Column(Column::from_name(
3275            self.ctx
3276                .time_index_column
3277                .clone()
3278                .with_context(|| TimeIndexNotFoundSnafu { table: "unknown" })?,
3279        )))
3280    }
3281
3282    fn create_tag_column_exprs(&self) -> Result<Vec<DfExpr>> {
3283        let mut result = Vec::with_capacity(self.ctx.tag_columns.len());
3284        for tag in &self.ctx.tag_columns {
3285            let expr = DfExpr::Column(Column::from_name(tag));
3286            result.push(expr);
3287        }
3288        Ok(result)
3289    }
3290
3291    fn create_field_column_exprs(&self) -> Result<Vec<DfExpr>> {
3292        let mut result = Vec::with_capacity(self.ctx.field_columns.len());
3293        for field in &self.ctx.field_columns {
3294            let expr = DfExpr::Column(Column::from_name(field));
3295            result.push(expr);
3296        }
3297        Ok(result)
3298    }
3299
3300    fn create_tag_and_time_index_column_sort_exprs(&self) -> Result<Vec<SortExpr>> {
3301        let mut result = self
3302            .ctx
3303            .tag_columns
3304            .iter()
3305            .map(|col| DfExpr::Column(Column::from_name(col)).sort(true, true))
3306            .collect::<Vec<_>>();
3307        result.push(self.create_time_index_column_expr()?.sort(true, true));
3308        Ok(result)
3309    }
3310
3311    fn create_field_columns_sort_exprs(&self, asc: bool) -> Vec<SortExpr> {
3312        self.ctx
3313            .field_columns
3314            .iter()
3315            .map(|col| DfExpr::Column(Column::from_name(col)).sort(asc, true))
3316            .collect::<Vec<_>>()
3317    }
3318
3319    fn create_sort_exprs_by_tags(
3320        func: &str,
3321        tags: Vec<DfExpr>,
3322        asc: bool,
3323    ) -> Result<Vec<SortExpr>> {
3324        ensure!(
3325            !tags.is_empty(),
3326            FunctionInvalidArgumentSnafu { fn_name: func }
3327        );
3328
3329        tags.iter()
3330            .map(|col| match col {
3331                DfExpr::Literal(ScalarValue::Utf8(Some(label)), _) => {
3332                    Ok(DfExpr::Column(Column::from_name(label)).sort(asc, false))
3333                }
3334                other => UnexpectedPlanExprSnafu {
3335                    desc: format!("expected label string literal, but found {:?}", other),
3336                }
3337                .fail(),
3338            })
3339            .collect::<Result<Vec<_>>>()
3340    }
3341
3342    fn create_empty_values_filter_expr(&self) -> Result<DfExpr> {
3343        let mut exprs = Vec::with_capacity(self.ctx.field_columns.len());
3344        for value in &self.ctx.field_columns {
3345            let expr = DfExpr::Column(Column::from_name(value)).is_not_null();
3346            exprs.push(expr);
3347        }
3348
3349        // This error context should be computed lazily: the planner may set `ctx.table_name` to
3350        // `None` for derived expressions (e.g. after projecting the LHS of a vector-vector
3351        // comparison filter). Eagerly calling `table_ref()?` here can turn a valid plan into
3352        // a `TableNameNotFound` error even when `conjunction(exprs)` succeeds.
3353        conjunction(exprs).with_context(|| ValueNotFoundSnafu {
3354            table: self
3355                .table_ref()
3356                .map(|t| t.to_quoted_string())
3357                .unwrap_or_else(|_| "unknown".to_string()),
3358        })
3359    }
3360
3361    /// Creates a set of DataFusion `DfExpr::AggregateFunction` expressions for each value column using the specified aggregate function.
3362    ///
3363    /// # Side Effects
3364    ///
3365    /// This method modifies the value columns in the context by replacing them with the new columns
3366    /// created by the aggregate function application.
3367    ///
3368    /// # Returns
3369    ///
3370    /// Returns a tuple of `(aggregate_expressions, previous_field_expressions)` where:
3371    /// - `aggregate_expressions`: Expressions that apply the aggregate function to the original fields
3372    /// - `previous_field_expressions`: Original field expressions before aggregation. This is non-empty
3373    ///   only when the operation is `count_values`, as this operation requires preserving the original
3374    ///   values for grouping.
3375    ///
3376    fn create_aggregate_exprs(
3377        &mut self,
3378        op: TokenType,
3379        param: &Option<Box<PromExpr>>,
3380        input_plan: &LogicalPlan,
3381    ) -> Result<(Vec<DfExpr>, Vec<DfExpr>)> {
3382        let mut non_col_args = Vec::new();
3383        let is_group_agg = op.id() == token::T_GROUP;
3384        if is_group_agg {
3385            ensure!(
3386                self.ctx.field_columns.len() == 1,
3387                MultiFieldsNotSupportedSnafu {
3388                    operator: "group()"
3389                }
3390            );
3391        }
3392        let aggr = match op.id() {
3393            token::T_SUM => sum_udaf(),
3394            token::T_QUANTILE => {
3395                let q =
3396                    Self::get_param_as_literal_expr(param, Some(op), Some(ArrowDataType::Float64))?;
3397                non_col_args.push(q);
3398                quantile_udaf()
3399            }
3400            token::T_AVG => avg_udaf(),
3401            token::T_COUNT_VALUES | token::T_COUNT => count_udaf(),
3402            token::T_MIN => min_udaf(),
3403            token::T_MAX => max_udaf(),
3404            // PromQL's `group()` aggregator produces 1 for each group.
3405            // Use `max(1.0)` (per-group) to match semantics and output type (Float64).
3406            token::T_GROUP => max_udaf(),
3407            token::T_STDDEV => stddev_pop_udaf(),
3408            token::T_STDVAR => var_pop_udaf(),
3409            token::T_TOPK | token::T_BOTTOMK => UnsupportedExprSnafu {
3410                name: format!("{op:?}"),
3411            }
3412            .fail()?,
3413            _ => UnexpectedTokenSnafu { token: op }.fail()?,
3414        };
3415
3416        // perform aggregate operation to each value column
3417        let exprs: Vec<DfExpr> = self
3418            .ctx
3419            .field_columns
3420            .iter()
3421            .map(|col| {
3422                if is_group_agg {
3423                    aggr.call(vec![lit(1_f64)])
3424                } else {
3425                    non_col_args.push(DfExpr::Column(Column::from_name(col)));
3426                    let expr = aggr.call(non_col_args.clone());
3427                    non_col_args.pop();
3428                    expr
3429                }
3430            })
3431            .collect::<Vec<_>>();
3432
3433        // if the aggregator is `count_values`, it must be grouped by current fields.
3434        let prev_field_exprs = if op.id() == token::T_COUNT_VALUES {
3435            let prev_field_exprs: Vec<_> = self
3436                .ctx
3437                .field_columns
3438                .iter()
3439                .map(|col| DfExpr::Column(Column::from_name(col)))
3440                .collect();
3441
3442            ensure!(
3443                self.ctx.field_columns.len() == 1,
3444                UnsupportedExprSnafu {
3445                    name: "count_values on multi-value input"
3446                }
3447            );
3448
3449            prev_field_exprs
3450        } else {
3451            vec![]
3452        };
3453
3454        // update value column name according to the aggregators,
3455        let mut new_field_columns = Vec::with_capacity(self.ctx.field_columns.len());
3456
3457        let normalized_exprs =
3458            normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
3459        for expr in normalized_exprs {
3460            new_field_columns.push(expr.schema_name().to_string());
3461        }
3462        self.ctx.field_columns = new_field_columns;
3463
3464        Ok((exprs, prev_field_exprs))
3465    }
3466
3467    fn get_param_value_as_str(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<&str> {
3468        let param = param
3469            .as_deref()
3470            .with_context(|| FunctionInvalidArgumentSnafu {
3471                fn_name: op.to_string(),
3472            })?;
3473        let PromExpr::StringLiteral(StringLiteral { val }) = param else {
3474            return FunctionInvalidArgumentSnafu {
3475                fn_name: op.to_string(),
3476            }
3477            .fail();
3478        };
3479
3480        Ok(val)
3481    }
3482
3483    fn get_param_as_literal_expr(
3484        param: &Option<Box<PromExpr>>,
3485        op: Option<TokenType>,
3486        expected_type: Option<ArrowDataType>,
3487    ) -> Result<DfExpr> {
3488        let prom_param = param.as_deref().with_context(|| {
3489            if let Some(op) = op {
3490                FunctionInvalidArgumentSnafu {
3491                    fn_name: op.to_string(),
3492                }
3493            } else {
3494                FunctionInvalidArgumentSnafu {
3495                    fn_name: "unknown".to_string(),
3496                }
3497            }
3498        })?;
3499
3500        let expr = Self::try_build_literal_expr(prom_param).with_context(|| {
3501            if let Some(op) = op {
3502                FunctionInvalidArgumentSnafu {
3503                    fn_name: op.to_string(),
3504                }
3505            } else {
3506                FunctionInvalidArgumentSnafu {
3507                    fn_name: "unknown".to_string(),
3508                }
3509            }
3510        })?;
3511
3512        // check if the type is expected
3513        if let Some(expected_type) = expected_type {
3514            // literal should not have reference to column
3515            let expr_type = expr
3516                .get_type(&DFSchema::empty())
3517                .context(DataFusionPlanningSnafu)?;
3518            if expected_type != expr_type {
3519                return FunctionInvalidArgumentSnafu {
3520                    fn_name: format!("expected {expected_type:?}, but found {expr_type:?}"),
3521                }
3522                .fail();
3523            }
3524        }
3525
3526        Ok(expr)
3527    }
3528
3529    /// Create [DfExpr::WindowFunction] expr for each value column with given window function.
3530    ///
3531    fn create_window_exprs(
3532        &mut self,
3533        op: TokenType,
3534        group_exprs: Vec<DfExpr>,
3535        input_plan: &LogicalPlan,
3536    ) -> Result<Vec<DfExpr>> {
3537        ensure!(
3538            self.ctx.field_columns.len() == 1,
3539            UnsupportedExprSnafu {
3540                name: "topk or bottomk on multi-value input"
3541            }
3542        );
3543
3544        assert!(matches!(op.id(), token::T_TOPK | token::T_BOTTOMK));
3545
3546        let asc = matches!(op.id(), token::T_BOTTOMK);
3547
3548        let tag_sort_exprs = self
3549            .create_tag_column_exprs()?
3550            .into_iter()
3551            .map(|expr| expr.sort(asc, true));
3552
3553        // perform window operation to each value column
3554        let exprs: Vec<DfExpr> = self
3555            .ctx
3556            .field_columns
3557            .iter()
3558            .map(|col| {
3559                let mut sort_exprs = Vec::with_capacity(self.ctx.tag_columns.len() + 1);
3560                // Order by value in the specific order
3561                sort_exprs.push(DfExpr::Column(Column::from(col)).sort(asc, true));
3562                // Then tags if the values are equal,
3563                // Try to ensure the relative stability of the output results.
3564                sort_exprs.extend(tag_sort_exprs.clone());
3565
3566                DfExpr::WindowFunction(Box::new(WindowFunction {
3567                    fun: WindowFunctionDefinition::WindowUDF(Arc::new(RowNumber::new().into())),
3568                    params: WindowFunctionParams {
3569                        args: vec![],
3570                        partition_by: group_exprs.clone(),
3571                        order_by: sort_exprs,
3572                        window_frame: WindowFrame::new(Some(true)),
3573                        null_treatment: None,
3574                        distinct: false,
3575                        filter: None,
3576                    },
3577                }))
3578            })
3579            .collect();
3580
3581        let normalized_exprs =
3582            normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
3583        Ok(normalized_exprs)
3584    }
3585
3586    /// Try to build a [f64] from [PromExpr].
3587    #[deprecated(
3588        note = "use `Self::get_param_as_literal_expr` instead. This is only for `create_histogram_plan`"
3589    )]
3590    fn try_build_float_literal(expr: &PromExpr) -> Option<f64> {
3591        match expr {
3592            PromExpr::NumberLiteral(NumberLiteral { val }) => Some(*val),
3593            PromExpr::Paren(ParenExpr { expr }) => Self::try_build_float_literal(expr),
3594            PromExpr::Unary(UnaryExpr { expr, .. }) => {
3595                Self::try_build_float_literal(expr).map(|f| -f)
3596            }
3597            PromExpr::StringLiteral(_)
3598            | PromExpr::Binary(_)
3599            | PromExpr::VectorSelector(_)
3600            | PromExpr::MatrixSelector(_)
3601            | PromExpr::Call(_)
3602            | PromExpr::Extension(_)
3603            | PromExpr::Aggregate(_)
3604            | PromExpr::Subquery(_) => None,
3605        }
3606    }
3607
3608    /// Create a [SPECIAL_HISTOGRAM_QUANTILE] plan.
3609    async fn create_histogram_plan(
3610        &mut self,
3611        args: &PromFunctionArgs,
3612        query_engine_state: &QueryEngineState,
3613    ) -> Result<LogicalPlan> {
3614        if args.args.len() != 2 {
3615            return FunctionInvalidArgumentSnafu {
3616                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3617            }
3618            .fail();
3619        }
3620        #[allow(deprecated)]
3621        let phi = Self::try_build_float_literal(&args.args[0]).with_context(|| {
3622            FunctionInvalidArgumentSnafu {
3623                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3624            }
3625        })?;
3626
3627        let input = args.args[1].as_ref().clone();
3628        let input_plan = self.prom_expr_to_plan(&input, query_engine_state).await?;
3629        // `histogram_quantile` folds buckets across `le`, so `__tsid` (which includes `le`) is not
3630        // a stable series identifier anymore. Also, HistogramFold infers label columns from the
3631        // input schema and must not treat `__tsid` as a label column.
3632        let input_plan = self.strip_tsid_column(input_plan)?;
3633        self.ctx.use_tsid = false;
3634
3635        if !self.ctx.has_le_tag() {
3636            // Return empty result instead of error when 'le' column is not found
3637            // This handles the case when histogram metrics don't exist
3638            return Ok(LogicalPlan::EmptyRelation(
3639                datafusion::logical_expr::EmptyRelation {
3640                    produce_one_row: false,
3641                    schema: Arc::new(DFSchema::empty()),
3642                },
3643            ));
3644        }
3645        let time_index_column =
3646            self.ctx
3647                .time_index_column
3648                .clone()
3649                .with_context(|| TimeIndexNotFoundSnafu {
3650                    table: self.ctx.table_name.clone().unwrap_or_default(),
3651                })?;
3652        // FIXME(ruihang): support multi fields
3653        let field_column = self
3654            .ctx
3655            .field_columns
3656            .first()
3657            .with_context(|| FunctionInvalidArgumentSnafu {
3658                fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(),
3659            })?
3660            .clone();
3661        // remove le column from tag columns
3662        self.ctx.tag_columns.retain(|col| col != LE_COLUMN_NAME);
3663
3664        Ok(LogicalPlan::Extension(Extension {
3665            node: Arc::new(
3666                HistogramFold::new(
3667                    LE_COLUMN_NAME.to_string(),
3668                    field_column,
3669                    time_index_column,
3670                    phi,
3671                    input_plan,
3672                )
3673                .context(DataFusionPlanningSnafu)?,
3674            ),
3675        }))
3676    }
3677
3678    /// Create a [SPECIAL_VECTOR_FUNCTION] plan
3679    async fn create_vector_plan(&mut self, args: &PromFunctionArgs) -> Result<LogicalPlan> {
3680        if args.args.len() != 1 {
3681            return FunctionInvalidArgumentSnafu {
3682                fn_name: SPECIAL_VECTOR_FUNCTION.to_string(),
3683            }
3684            .fail();
3685        }
3686        let lit = Self::get_param_as_literal_expr(&Some(args.args[0].clone()), None, None)?;
3687
3688        // reuse `SPECIAL_TIME_FUNCTION` as name of time index column
3689        self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
3690        self.ctx.reset_table_name_and_schema();
3691        self.ctx.tag_columns = vec![];
3692        self.ctx.field_columns = vec![greptime_value().to_string()];
3693        Ok(LogicalPlan::Extension(Extension {
3694            node: Arc::new(
3695                EmptyMetric::new(
3696                    self.ctx.start,
3697                    self.ctx.end,
3698                    self.ctx.interval,
3699                    SPECIAL_TIME_FUNCTION.to_string(),
3700                    greptime_value().to_string(),
3701                    Some(lit),
3702                )
3703                .context(DataFusionPlanningSnafu)?,
3704            ),
3705        }))
3706    }
3707
3708    /// Create a [SCALAR_FUNCTION] plan
3709    async fn create_scalar_plan(
3710        &mut self,
3711        args: &PromFunctionArgs,
3712        query_engine_state: &QueryEngineState,
3713    ) -> Result<LogicalPlan> {
3714        ensure!(
3715            args.len() == 1,
3716            FunctionInvalidArgumentSnafu {
3717                fn_name: SCALAR_FUNCTION
3718            }
3719        );
3720        let input = self
3721            .prom_expr_to_plan(&args.args[0], query_engine_state)
3722            .await?;
3723        ensure!(
3724            self.ctx.field_columns.len() == 1,
3725            MultiFieldsNotSupportedSnafu {
3726                operator: SCALAR_FUNCTION
3727            },
3728        );
3729        let scalar_plan = LogicalPlan::Extension(Extension {
3730            node: Arc::new(
3731                ScalarCalculate::new(
3732                    self.ctx.start,
3733                    self.ctx.end,
3734                    self.ctx.interval,
3735                    input,
3736                    self.ctx.time_index_column.as_ref().unwrap(),
3737                    &self.ctx.tag_columns,
3738                    &self.ctx.field_columns[0],
3739                    self.ctx.table_name.as_deref(),
3740                )
3741                .context(PromqlPlanNodeSnafu)?,
3742            ),
3743        });
3744        // scalar plan have no tag columns
3745        self.ctx.tag_columns.clear();
3746        self.ctx.field_columns.clear();
3747        self.ctx
3748            .field_columns
3749            .push(scalar_plan.schema().field(1).name().clone());
3750        Ok(scalar_plan)
3751    }
3752
3753    /// Create a [SPECIAL_ABSENT_FUNCTION] plan
3754    async fn create_absent_plan(
3755        &mut self,
3756        args: &PromFunctionArgs,
3757        query_engine_state: &QueryEngineState,
3758    ) -> Result<LogicalPlan> {
3759        if args.args.len() != 1 {
3760            return FunctionInvalidArgumentSnafu {
3761                fn_name: SPECIAL_ABSENT_FUNCTION.to_string(),
3762            }
3763            .fail();
3764        }
3765        let input = self
3766            .prom_expr_to_plan(&args.args[0], query_engine_state)
3767            .await?;
3768
3769        let time_index_expr = self.create_time_index_column_expr()?;
3770        let first_field_expr =
3771            self.create_field_column_exprs()?
3772                .pop()
3773                .with_context(|| ValueNotFoundSnafu {
3774                    table: self.ctx.table_name.clone().unwrap_or_default(),
3775                })?;
3776        let first_value_expr = first_value(first_field_expr, vec![]);
3777
3778        let ordered_aggregated_input = LogicalPlanBuilder::from(input)
3779            .aggregate(
3780                vec![time_index_expr.clone()],
3781                vec![first_value_expr.clone()],
3782            )
3783            .context(DataFusionPlanningSnafu)?
3784            .sort(vec![time_index_expr.sort(true, false)])
3785            .context(DataFusionPlanningSnafu)?
3786            .build()
3787            .context(DataFusionPlanningSnafu)?;
3788
3789        let fake_labels = self
3790            .ctx
3791            .selector_matcher
3792            .iter()
3793            .filter_map(|matcher| match matcher.op {
3794                MatchOp::Equal => Some((matcher.name.clone(), matcher.value.clone())),
3795                _ => None,
3796            })
3797            .collect::<Vec<_>>();
3798
3799        // Create the absent plan
3800        let absent_plan = LogicalPlan::Extension(Extension {
3801            node: Arc::new(
3802                Absent::try_new(
3803                    self.ctx.start,
3804                    self.ctx.end,
3805                    self.ctx.interval,
3806                    self.ctx.time_index_column.as_ref().unwrap().clone(),
3807                    self.ctx.field_columns[0].clone(),
3808                    fake_labels,
3809                    ordered_aggregated_input,
3810                )
3811                .context(DataFusionPlanningSnafu)?,
3812            ),
3813        });
3814
3815        Ok(absent_plan)
3816    }
3817
3818    /// Try to build a DataFusion Literal Expression from PromQL Expr, return
3819    /// `None` if the input is not a literal expression.
3820    fn try_build_literal_expr(expr: &PromExpr) -> Option<DfExpr> {
3821        match expr {
3822            PromExpr::NumberLiteral(NumberLiteral { val }) => Some(val.lit()),
3823            PromExpr::StringLiteral(StringLiteral { val }) => Some(val.lit()),
3824            PromExpr::VectorSelector(_)
3825            | PromExpr::MatrixSelector(_)
3826            | PromExpr::Extension(_)
3827            | PromExpr::Aggregate(_)
3828            | PromExpr::Subquery(_) => None,
3829            PromExpr::Call(Call { func, .. }) => {
3830                if func.name == SPECIAL_TIME_FUNCTION {
3831                    // For time() function, don't treat it as a literal
3832                    // Let it be handled as a regular function call
3833                    None
3834                } else {
3835                    None
3836                }
3837            }
3838            PromExpr::Paren(ParenExpr { expr }) => Self::try_build_literal_expr(expr),
3839            // TODO(ruihang): support Unary operator
3840            PromExpr::Unary(UnaryExpr { expr, .. }) => Self::try_build_literal_expr(expr),
3841            PromExpr::Binary(PromBinaryExpr {
3842                lhs,
3843                rhs,
3844                op,
3845                modifier,
3846            }) => {
3847                let lhs = Self::try_build_literal_expr(lhs)?;
3848                let rhs = Self::try_build_literal_expr(rhs)?;
3849                let is_comparison_op = Self::is_token_a_comparison_op(*op);
3850                let expr_builder = Self::prom_token_to_binary_expr_builder(*op).ok()?;
3851                let expr = expr_builder(lhs, rhs).ok()?;
3852
3853                let should_return_bool = if let Some(m) = modifier {
3854                    m.return_bool
3855                } else {
3856                    false
3857                };
3858                if is_comparison_op && should_return_bool {
3859                    Some(DfExpr::Cast(Cast {
3860                        expr: Box::new(expr),
3861                        data_type: ArrowDataType::Float64,
3862                    }))
3863                } else {
3864                    Some(expr)
3865                }
3866            }
3867        }
3868    }
3869
3870    fn try_build_special_time_expr_with_context(&self, expr: &PromExpr) -> Option<DfExpr> {
3871        match expr {
3872            PromExpr::Call(Call { func, .. }) => {
3873                if func.name == SPECIAL_TIME_FUNCTION
3874                    && let Some(time_index_col) = self.ctx.time_index_column.as_ref()
3875                {
3876                    Some(build_special_time_expr(time_index_col))
3877                } else {
3878                    None
3879                }
3880            }
3881            _ => None,
3882        }
3883    }
3884
3885    /// Return a lambda to build binary expression from token.
3886    /// Because some binary operator are function in DataFusion like `atan2` or `^`.
3887    #[allow(clippy::type_complexity)]
3888    fn prom_token_to_binary_expr_builder(
3889        token: TokenType,
3890    ) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
3891        let cast_float = |expr| {
3892            if matches!(
3893                &expr,
3894                DfExpr::Cast(Cast {
3895                    data_type: ArrowDataType::Float64,
3896                    ..
3897                })
3898            ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
3899            {
3900                expr
3901            } else {
3902                DfExpr::Cast(Cast {
3903                    expr: Box::new(expr),
3904                    data_type: ArrowDataType::Float64,
3905                })
3906            }
3907        };
3908        match token.id() {
3909            token::T_ADD => Ok(Box::new(move |lhs, rhs| {
3910                Ok(cast_float(lhs) + cast_float(rhs))
3911            })),
3912            token::T_SUB => Ok(Box::new(move |lhs, rhs| {
3913                Ok(cast_float(lhs) - cast_float(rhs))
3914            })),
3915            token::T_MUL => Ok(Box::new(move |lhs, rhs| {
3916                Ok(cast_float(lhs) * cast_float(rhs))
3917            })),
3918            token::T_DIV => Ok(Box::new(move |lhs, rhs| {
3919                Ok(cast_float(lhs) / cast_float(rhs))
3920            })),
3921            token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
3922                Ok(cast_float(lhs) % cast_float(rhs))
3923            })),
3924            token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
3925            token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
3926            token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
3927            token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
3928            token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
3929            token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
3930            token::T_POW => Ok(Box::new(move |lhs, rhs| {
3931                Ok(DfExpr::ScalarFunction(ScalarFunction {
3932                    func: datafusion_functions::math::power(),
3933                    args: vec![cast_float(lhs), cast_float(rhs)],
3934                }))
3935            })),
3936            token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
3937                Ok(DfExpr::ScalarFunction(ScalarFunction {
3938                    func: datafusion_functions::math::atan2(),
3939                    args: vec![cast_float(lhs), cast_float(rhs)],
3940                }))
3941            })),
3942            _ => UnexpectedTokenSnafu { token }.fail(),
3943        }
3944    }
3945
3946    /// Check if the given op is a [comparison operator](https://prometheus.io/docs/prometheus/latest/querying/operators/#comparison-binary-operators).
3947    fn is_token_a_comparison_op(token: TokenType) -> bool {
3948        matches!(
3949            token.id(),
3950            token::T_EQLC
3951                | token::T_NEQ
3952                | token::T_GTR
3953                | token::T_LSS
3954                | token::T_GTE
3955                | token::T_LTE
3956        )
3957    }
3958
3959    /// Check if the given op is a set operator (UNION, INTERSECT and EXCEPT in SQL).
3960    fn is_token_a_set_op(token: TokenType) -> bool {
3961        matches!(
3962            token.id(),
3963            token::T_LAND // INTERSECT
3964                | token::T_LOR // UNION
3965                | token::T_LUNLESS // EXCEPT
3966        )
3967    }
3968
3969    fn align_binary_field_columns<'a>(
3970        left_field_columns: &'a [String],
3971        right_field_columns: &'a [String],
3972    ) -> (Vec<String>, Vec<(&'a String, &'a String)>) {
3973        let field_pairs = left_field_columns
3974            .iter()
3975            .zip(right_field_columns.iter())
3976            .collect::<Vec<_>>();
3977        let output_field_columns = field_pairs
3978            .iter()
3979            .map(|(left_col_name, _)| (*left_col_name).clone())
3980            .collect();
3981        (output_field_columns, field_pairs)
3982    }
3983
3984    fn plan_has_tsid_column(plan: &LogicalPlan) -> bool {
3985        plan.schema()
3986            .fields()
3987            .iter()
3988            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
3989    }
3990
3991    fn optional_tsid_projection(
3992        schema: &DFSchemaRef,
3993        table_ref: Option<&TableReference>,
3994        keep_tsid: bool,
3995    ) -> Option<DfExpr> {
3996        keep_tsid.then_some(()).and_then(|_| {
3997            schema
3998                .qualified_field_with_name(table_ref, DATA_SCHEMA_TSID_COLUMN_NAME)
3999                .ok()
4000                .map(|field| DfExpr::Column(field.into()))
4001        })
4002    }
4003
4004    fn binary_join_key_columns(
4005        &self,
4006        left_schema: &DFSchemaRef,
4007        right_schema: &DFSchemaRef,
4008        left_context: &PromPlannerContext,
4009        right_context: &PromPlannerContext,
4010        only_join_time_index: bool,
4011        modifier: &Option<BinModifier>,
4012    ) -> Result<(BTreeSet<String>, BTreeSet<String>, bool)> {
4013        let has_tsid = |schema: &DFSchemaRef| {
4014            schema
4015                .fields()
4016                .iter()
4017                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
4018        };
4019        let use_tsid_join = !only_join_time_index
4020            && self.binary_modifier_preserves_tsid_join_key(left_context, right_context, modifier)
4021            && left_context.use_tsid
4022            && right_context.use_tsid
4023            && has_tsid(left_schema)
4024            && has_tsid(right_schema);
4025
4026        let (mut left_tag_columns, mut right_tag_columns) = if use_tsid_join {
4027            (
4028                BTreeSet::from([DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]),
4029                BTreeSet::from([DATA_SCHEMA_TSID_COLUMN_NAME.to_string()]),
4030            )
4031        } else {
4032            if only_join_time_index {
4033                (BTreeSet::new(), BTreeSet::new())
4034            } else {
4035                (
4036                    left_context
4037                        .tag_columns
4038                        .iter()
4039                        .cloned()
4040                        .collect::<BTreeSet<_>>(),
4041                    right_context
4042                        .tag_columns
4043                        .iter()
4044                        .cloned()
4045                        .collect::<BTreeSet<_>>(),
4046                )
4047            }
4048        };
4049
4050        if !use_tsid_join
4051            && let Some(modifier) = modifier
4052            && let Some(matching) = &modifier.matching
4053        {
4054            match matching {
4055                LabelModifier::Include(on) => {
4056                    let mask = on.labels.iter().cloned().collect::<BTreeSet<_>>();
4057                    left_tag_columns = left_tag_columns.intersection(&mask).cloned().collect();
4058                    right_tag_columns = right_tag_columns.intersection(&mask).cloned().collect();
4059                }
4060                LabelModifier::Exclude(ignoring) => {
4061                    for label in &ignoring.labels {
4062                        let _ = left_tag_columns.remove(label);
4063                        let _ = right_tag_columns.remove(label);
4064                    }
4065                }
4066            }
4067        }
4068
4069        let force_empty_join =
4070            !use_tsid_join && !only_join_time_index && left_tag_columns != right_tag_columns;
4071        if force_empty_join {
4072            let common_tag_columns = left_tag_columns
4073                .intersection(&right_tag_columns)
4074                .cloned()
4075                .collect::<BTreeSet<_>>();
4076            left_tag_columns = common_tag_columns.clone();
4077            right_tag_columns = common_tag_columns;
4078        }
4079
4080        Ok((left_tag_columns, right_tag_columns, force_empty_join))
4081    }
4082
4083    fn binary_modifier_preserves_tsid_join_key(
4084        &self,
4085        left_context: &PromPlannerContext,
4086        right_context: &PromPlannerContext,
4087        modifier: &Option<BinModifier>,
4088    ) -> bool {
4089        let Some(modifier) = modifier else {
4090            return true;
4091        };
4092
4093        if !matches!(modifier.card, VectorMatchCardinality::OneToOne) {
4094            return false;
4095        }
4096
4097        match &modifier.matching {
4098            None => true,
4099            Some(LabelModifier::Exclude(ignoring)) => ignoring.labels.iter().all(|label| {
4100                !left_context.tag_columns.contains(label)
4101                    && !right_context.tag_columns.contains(label)
4102            }),
4103            Some(LabelModifier::Include(on)) => {
4104                let on_labels = on.labels.iter().cloned().collect::<BTreeSet<_>>();
4105                let left_labels = left_context
4106                    .tag_columns
4107                    .iter()
4108                    .cloned()
4109                    .collect::<BTreeSet<_>>();
4110                let right_labels = right_context
4111                    .tag_columns
4112                    .iter()
4113                    .cloned()
4114                    .collect::<BTreeSet<_>>();
4115
4116                on_labels == left_labels && on_labels == right_labels
4117            }
4118        }
4119    }
4120
4121    /// Build a inner join on time index column and tag columns to concat two logical plans.
4122    /// When `only_join_time_index == true` we only join on the time index, because these two plan may not have the same tag columns
4123    #[allow(clippy::too_many_arguments)]
4124    fn join_on_non_field_columns(
4125        &self,
4126        left: LogicalPlan,
4127        right: LogicalPlan,
4128        left_table_ref: TableReference,
4129        right_table_ref: TableReference,
4130        left_time_index_column: Option<String>,
4131        right_time_index_column: Option<String>,
4132        only_join_time_index: bool,
4133        modifier: &Option<BinModifier>,
4134        left_context: &PromPlannerContext,
4135        right_context: &PromPlannerContext,
4136    ) -> Result<LogicalPlan> {
4137        let (mut left_tag_columns, mut right_tag_columns, force_empty_join) = self
4138            .binary_join_key_columns(
4139                left.schema(),
4140                right.schema(),
4141                left_context,
4142                right_context,
4143                only_join_time_index,
4144                modifier,
4145            )?;
4146
4147        // push time index column if it exists
4148        if let (Some(left_time_index_column), Some(right_time_index_column)) =
4149            (left_time_index_column, right_time_index_column)
4150        {
4151            left_tag_columns.insert(left_time_index_column);
4152            right_tag_columns.insert(right_time_index_column);
4153        }
4154
4155        let right = LogicalPlanBuilder::from(right)
4156            .alias(right_table_ref)
4157            .context(DataFusionPlanningSnafu)?
4158            .build()
4159            .context(DataFusionPlanningSnafu)?;
4160
4161        // Inner Join on time index column to concat two operator
4162        LogicalPlanBuilder::from(left)
4163            .alias(left_table_ref)
4164            .context(DataFusionPlanningSnafu)?
4165            .join_detailed(
4166                right,
4167                JoinType::Inner,
4168                (
4169                    left_tag_columns
4170                        .into_iter()
4171                        .map(Column::from_name)
4172                        .collect::<Vec<_>>(),
4173                    right_tag_columns
4174                        .into_iter()
4175                        .map(Column::from_name)
4176                        .collect::<Vec<_>>(),
4177                ),
4178                force_empty_join.then_some(lit(false)),
4179                NullEquality::NullEqualsNull,
4180            )
4181            .context(DataFusionPlanningSnafu)?
4182            .build()
4183            .context(DataFusionPlanningSnafu)
4184    }
4185
4186    /// Build a set operator (AND/OR/UNLESS)
4187    fn set_op_on_non_field_columns(
4188        &mut self,
4189        left: LogicalPlan,
4190        mut right: LogicalPlan,
4191        left_context: PromPlannerContext,
4192        right_context: PromPlannerContext,
4193        op: TokenType,
4194        modifier: &Option<BinModifier>,
4195    ) -> Result<LogicalPlan> {
4196        let mut left_tag_col_set = left_context
4197            .tag_columns
4198            .iter()
4199            .cloned()
4200            .collect::<HashSet<_>>();
4201        let mut right_tag_col_set = right_context
4202            .tag_columns
4203            .iter()
4204            .cloned()
4205            .collect::<HashSet<_>>();
4206
4207        if matches!(op.id(), token::T_LOR) {
4208            return self.or_operator(
4209                left,
4210                right,
4211                left_tag_col_set,
4212                right_tag_col_set,
4213                left_context,
4214                right_context,
4215                modifier,
4216            );
4217        }
4218
4219        // apply modifier
4220        if let Some(modifier) = modifier {
4221            // one-to-many and many-to-one are not supported
4222            ensure!(
4223                matches!(
4224                    modifier.card,
4225                    VectorMatchCardinality::OneToOne | VectorMatchCardinality::ManyToMany
4226                ),
4227                UnsupportedVectorMatchSnafu {
4228                    name: modifier.card.clone(),
4229                },
4230            );
4231            // apply label modifier
4232            if let Some(matching) = &modifier.matching {
4233                match matching {
4234                    // keeps columns mentioned in `on`
4235                    LabelModifier::Include(on) => {
4236                        let mask = on.labels.iter().cloned().collect::<HashSet<_>>();
4237                        left_tag_col_set = left_tag_col_set.intersection(&mask).cloned().collect();
4238                        right_tag_col_set =
4239                            right_tag_col_set.intersection(&mask).cloned().collect();
4240                    }
4241                    // removes columns memtioned in `ignoring`
4242                    LabelModifier::Exclude(ignoring) => {
4243                        // doesn't check existence of label
4244                        for label in &ignoring.labels {
4245                            let _ = left_tag_col_set.remove(label);
4246                            let _ = right_tag_col_set.remove(label);
4247                        }
4248                    }
4249                }
4250            }
4251        }
4252        // ensure two sides have the same tag columns
4253        if !matches!(op.id(), token::T_LOR) {
4254            ensure!(
4255                left_tag_col_set == right_tag_col_set,
4256                CombineTableColumnMismatchSnafu {
4257                    left: left_tag_col_set.into_iter().collect::<Vec<_>>(),
4258                    right: right_tag_col_set.into_iter().collect::<Vec<_>>(),
4259                }
4260            )
4261        };
4262        let left_time_index = left_context.time_index_column.clone().unwrap();
4263        let right_time_index = right_context.time_index_column.clone().unwrap();
4264        let join_keys = left_tag_col_set
4265            .iter()
4266            .cloned()
4267            .chain([left_time_index.clone()])
4268            .collect::<Vec<_>>();
4269        self.ctx.time_index_column = Some(left_time_index.clone());
4270        self.ctx.use_tsid = left_context.use_tsid;
4271
4272        // alias right time index column if necessary
4273        if left_context.time_index_column != right_context.time_index_column {
4274            let right_project_exprs = right
4275                .schema()
4276                .fields()
4277                .iter()
4278                .map(|field| {
4279                    if field.name() == &right_time_index {
4280                        DfExpr::Column(Column::from_name(&right_time_index)).alias(&left_time_index)
4281                    } else {
4282                        DfExpr::Column(Column::from_name(field.name()))
4283                    }
4284                })
4285                .collect::<Vec<_>>();
4286
4287            right = LogicalPlanBuilder::from(right)
4288                .project(right_project_exprs)
4289                .context(DataFusionPlanningSnafu)?
4290                .build()
4291                .context(DataFusionPlanningSnafu)?;
4292        }
4293
4294        ensure!(
4295            left_context.field_columns.len() == 1,
4296            MultiFieldsNotSupportedSnafu {
4297                operator: "AND operator"
4298            }
4299        );
4300        // Update the field column in context.
4301        // The AND/UNLESS operator only keep the field column in left input.
4302        let left_field_col = left_context.field_columns.first().unwrap();
4303        self.ctx.field_columns = vec![left_field_col.clone()];
4304
4305        // Generate join plan.
4306        // All set operations in PromQL are "distinct"
4307        match op.id() {
4308            token::T_LAND => LogicalPlanBuilder::from(left)
4309                .distinct()
4310                .context(DataFusionPlanningSnafu)?
4311                .join_detailed(
4312                    right,
4313                    JoinType::LeftSemi,
4314                    (join_keys.clone(), join_keys),
4315                    None,
4316                    NullEquality::NullEqualsNull,
4317                )
4318                .context(DataFusionPlanningSnafu)?
4319                .build()
4320                .context(DataFusionPlanningSnafu),
4321            token::T_LUNLESS => LogicalPlanBuilder::from(left)
4322                .distinct()
4323                .context(DataFusionPlanningSnafu)?
4324                .join_detailed(
4325                    right,
4326                    JoinType::LeftAnti,
4327                    (join_keys.clone(), join_keys),
4328                    None,
4329                    NullEquality::NullEqualsNull,
4330                )
4331                .context(DataFusionPlanningSnafu)?
4332                .build()
4333                .context(DataFusionPlanningSnafu),
4334            token::T_LOR => {
4335                // OR is handled at the beginning of this function, as it cannot
4336                // be expressed using JOIN like AND and UNLESS.
4337                unreachable!()
4338            }
4339            _ => UnexpectedTokenSnafu { token: op }.fail(),
4340        }
4341    }
4342
4343    // TODO(ruihang): change function name
4344    #[allow(clippy::too_many_arguments)]
4345    fn or_operator(
4346        &mut self,
4347        left: LogicalPlan,
4348        right: LogicalPlan,
4349        left_tag_cols_set: HashSet<String>,
4350        right_tag_cols_set: HashSet<String>,
4351        left_context: PromPlannerContext,
4352        right_context: PromPlannerContext,
4353        modifier: &Option<BinModifier>,
4354    ) -> Result<LogicalPlan> {
4355        // checks
4356        ensure!(
4357            left_context.field_columns.len() == right_context.field_columns.len(),
4358            CombineTableColumnMismatchSnafu {
4359                left: left_context.field_columns.clone(),
4360                right: right_context.field_columns.clone()
4361            }
4362        );
4363        ensure!(
4364            left_context.field_columns.len() == 1,
4365            MultiFieldsNotSupportedSnafu {
4366                operator: "OR operator"
4367            }
4368        );
4369
4370        // prepare hash sets
4371        let all_tags = left_tag_cols_set
4372            .union(&right_tag_cols_set)
4373            .cloned()
4374            .collect::<HashSet<_>>();
4375        let tags_not_in_left = all_tags
4376            .difference(&left_tag_cols_set)
4377            .cloned()
4378            .collect::<Vec<_>>();
4379        let tags_not_in_right = all_tags
4380            .difference(&right_tag_cols_set)
4381            .cloned()
4382            .collect::<Vec<_>>();
4383        let left_qualifier = left.schema().qualified_field(0).0.cloned();
4384        let right_qualifier = right.schema().qualified_field(0).0.cloned();
4385        let left_qualifier_string = left_qualifier
4386            .as_ref()
4387            .map(|l| l.to_string())
4388            .unwrap_or_default();
4389        let right_qualifier_string = right_qualifier
4390            .as_ref()
4391            .map(|r| r.to_string())
4392            .unwrap_or_default();
4393        let left_time_index_column =
4394            left_context
4395                .time_index_column
4396                .clone()
4397                .with_context(|| TimeIndexNotFoundSnafu {
4398                    table: left_qualifier_string.clone(),
4399                })?;
4400        let right_time_index_column =
4401            right_context
4402                .time_index_column
4403                .clone()
4404                .with_context(|| TimeIndexNotFoundSnafu {
4405                    table: right_qualifier_string.clone(),
4406                })?;
4407        // Take the name of first field column. The length is checked above.
4408        let left_field_col = left_context.field_columns.first().unwrap();
4409        let right_field_col = right_context.field_columns.first().unwrap();
4410        let left_has_tsid = left
4411            .schema()
4412            .fields()
4413            .iter()
4414            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME);
4415        let right_has_tsid = right
4416            .schema()
4417            .fields()
4418            .iter()
4419            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME);
4420
4421        // step 0: fill all columns in output schema
4422        let mut all_columns_set = left
4423            .schema()
4424            .fields()
4425            .iter()
4426            .chain(right.schema().fields().iter())
4427            .map(|field| field.name().clone())
4428            .collect::<HashSet<_>>();
4429        // Keep `__tsid` only when both sides contain it, otherwise it may break schema alignment
4430        // (e.g. `unknown_metric or some_metric`).
4431        if !(left_has_tsid && right_has_tsid) {
4432            all_columns_set.remove(DATA_SCHEMA_TSID_COLUMN_NAME);
4433        }
4434        // remove time index column
4435        all_columns_set.remove(&left_time_index_column);
4436        all_columns_set.remove(&right_time_index_column);
4437        // remove field column in the right
4438        if left_field_col != right_field_col {
4439            all_columns_set.remove(right_field_col);
4440        }
4441        let mut all_columns = all_columns_set.into_iter().collect::<Vec<_>>();
4442        // sort to ensure the generated schema is not volatile
4443        all_columns.sort_unstable();
4444        // use left time index column name as the result time index column name
4445        all_columns.insert(0, left_time_index_column.clone());
4446
4447        // step 1: align schema using project, fill non-exist columns with null
4448        let left_proj_exprs = all_columns.iter().map(|col| {
4449            if tags_not_in_left.contains(col) {
4450                DfExpr::Literal(ScalarValue::Utf8(None), None).alias(col.clone())
4451            } else {
4452                DfExpr::Column(Column::new(None::<String>, col))
4453            }
4454        });
4455        let right_time_index_expr = DfExpr::Column(Column::new(
4456            right_qualifier.clone(),
4457            right_time_index_column,
4458        ))
4459        .alias(left_time_index_column.clone());
4460        // The field column in right side may not have qualifier (it may be removed by join operation),
4461        // so we need to find it from the schema.
4462        let right_qualifier_for_field = right
4463            .schema()
4464            .iter()
4465            .find(|(_, f)| f.name() == right_field_col)
4466            .map(|(q, _)| q)
4467            .with_context(|| ColumnNotFoundSnafu {
4468                col: right_field_col.clone(),
4469            })?
4470            .cloned();
4471
4472        // `skip(1)` to skip the time index column
4473        let right_proj_exprs_without_time_index = all_columns.iter().skip(1).map(|col| {
4474            // expr
4475            if col == left_field_col && left_field_col != right_field_col {
4476                // qualify field in right side if necessary to handle different field name
4477                DfExpr::Column(Column::new(
4478                    right_qualifier_for_field.clone(),
4479                    right_field_col,
4480                ))
4481            } else if tags_not_in_right.contains(col) {
4482                DfExpr::Literal(ScalarValue::Utf8(None), None).alias(col.clone())
4483            } else {
4484                DfExpr::Column(Column::new(None::<String>, col))
4485            }
4486        });
4487        let right_proj_exprs = [right_time_index_expr]
4488            .into_iter()
4489            .chain(right_proj_exprs_without_time_index);
4490
4491        let left_projected = LogicalPlanBuilder::from(left)
4492            .project(left_proj_exprs)
4493            .context(DataFusionPlanningSnafu)?
4494            .alias(left_qualifier_string.clone())
4495            .context(DataFusionPlanningSnafu)?
4496            .build()
4497            .context(DataFusionPlanningSnafu)?;
4498        let right_projected = LogicalPlanBuilder::from(right)
4499            .project(right_proj_exprs)
4500            .context(DataFusionPlanningSnafu)?
4501            .alias(right_qualifier_string.clone())
4502            .context(DataFusionPlanningSnafu)?
4503            .build()
4504            .context(DataFusionPlanningSnafu)?;
4505
4506        // step 2: compute match columns
4507        let mut match_columns = if let Some(modifier) = modifier
4508            && let Some(matching) = &modifier.matching
4509        {
4510            match matching {
4511                // keeps columns mentioned in `on`
4512                LabelModifier::Include(on) => on.labels.clone(),
4513                // removes columns memtioned in `ignoring`
4514                LabelModifier::Exclude(ignoring) => {
4515                    let ignoring = ignoring.labels.iter().cloned().collect::<HashSet<_>>();
4516                    all_tags.difference(&ignoring).cloned().collect()
4517                }
4518            }
4519        } else {
4520            all_tags.iter().cloned().collect()
4521        };
4522        // sort to ensure the generated plan is not volatile
4523        match_columns.sort_unstable();
4524        // step 3: build `UnionDistinctOn` plan
4525        let schema = left_projected.schema().clone();
4526        let union_distinct_on = UnionDistinctOn::new(
4527            left_projected,
4528            right_projected,
4529            match_columns,
4530            left_time_index_column.clone(),
4531            schema,
4532        );
4533        let result = LogicalPlan::Extension(Extension {
4534            node: Arc::new(union_distinct_on),
4535        });
4536
4537        // step 4: update context
4538        self.ctx.time_index_column = Some(left_time_index_column);
4539        self.ctx.tag_columns = all_tags.into_iter().collect();
4540        self.ctx.field_columns = vec![left_field_col.clone()];
4541        self.ctx.use_tsid = left_has_tsid && right_has_tsid;
4542
4543        Ok(result)
4544    }
4545
4546    /// Build a projection that project and perform operation expr for every value columns.
4547    /// Non-value columns (tag and timestamp) will be preserved in the projection.
4548    ///
4549    /// # Side effect
4550    ///
4551    /// This function will update the value columns in the context. Those new column names
4552    /// don't contains qualifier.
4553    fn projection_for_each_field_column<F>(
4554        &mut self,
4555        input: LogicalPlan,
4556        name_to_expr: F,
4557    ) -> Result<LogicalPlan>
4558    where
4559        F: FnMut(&String) -> Result<DfExpr>,
4560    {
4561        let table_ref = self.ctx.table_name.clone().map(TableReference::bare);
4562        let non_field_columns_iter = self
4563            .ctx
4564            .tag_columns
4565            .iter()
4566            .chain(self.ctx.time_index_column.iter())
4567            .map(|col| Ok(DfExpr::Column(Column::new(table_ref.clone(), col))));
4568        let tsid_iter =
4569            Self::optional_tsid_projection(input.schema(), table_ref.as_ref(), self.ctx.use_tsid)
4570                .into_iter()
4571                .map(Ok);
4572
4573        // build computation exprs
4574        let result_field_columns = self
4575            .ctx
4576            .field_columns
4577            .iter()
4578            .map(name_to_expr)
4579            .collect::<Result<Vec<_>>>()?;
4580
4581        // alias the computation exprs to remove qualifier
4582        self.ctx.field_columns = result_field_columns
4583            .iter()
4584            .map(|expr| expr.schema_name().to_string())
4585            .collect();
4586        let field_columns_iter = result_field_columns
4587            .into_iter()
4588            .zip(self.ctx.field_columns.iter())
4589            .map(|(expr, name)| Ok(DfExpr::Alias(Alias::new(expr, None::<String>, name))));
4590
4591        // chain non-field columns (unchanged) and field columns (applied computation then alias)
4592        let project_fields = non_field_columns_iter
4593            .chain(tsid_iter)
4594            .chain(field_columns_iter)
4595            .collect::<Result<Vec<_>>>()?;
4596
4597        LogicalPlanBuilder::from(input)
4598            .project(project_fields)
4599            .context(DataFusionPlanningSnafu)?
4600            .build()
4601            .context(DataFusionPlanningSnafu)
4602    }
4603
4604    /// Build a filter plan that filter on value column. Notice that only one value column
4605    /// is expected.
4606    fn filter_on_field_column<F>(
4607        &self,
4608        input: LogicalPlan,
4609        mut name_to_expr: F,
4610    ) -> Result<LogicalPlan>
4611    where
4612        F: FnMut(&String) -> Result<DfExpr>,
4613    {
4614        ensure!(
4615            self.ctx.field_columns.len() == 1,
4616            UnsupportedExprSnafu {
4617                name: "filter on multi-value input"
4618            }
4619        );
4620
4621        let field_column_filter = name_to_expr(&self.ctx.field_columns[0])?;
4622
4623        LogicalPlanBuilder::from(input)
4624            .filter(field_column_filter)
4625            .context(DataFusionPlanningSnafu)?
4626            .build()
4627            .context(DataFusionPlanningSnafu)
4628    }
4629
4630    /// Generate an expr like `date_part("hour", <TIME_INDEX>)`. Caller should ensure the
4631    /// time index column in context is set
4632    fn date_part_on_time_index(&self, date_part: &str) -> Result<DfExpr> {
4633        let input_expr = datafusion::logical_expr::col(
4634            self.ctx
4635                .time_index_column
4636                .as_ref()
4637                // table name doesn't matters here
4638                .with_context(|| TimeIndexNotFoundSnafu {
4639                    table: "<doesn't matter>",
4640                })?,
4641        );
4642        let fn_expr = DfExpr::ScalarFunction(ScalarFunction {
4643            func: datafusion_functions::datetime::date_part(),
4644            args: vec![date_part.lit(), input_expr],
4645        });
4646        Ok(fn_expr)
4647    }
4648
4649    fn strip_tsid_column(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
4650        let schema = plan.schema();
4651        if !schema
4652            .fields()
4653            .iter()
4654            .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
4655        {
4656            return Ok(plan);
4657        }
4658
4659        // Preserve column qualifiers so downstream plan nodes can keep referencing
4660        // the columns by their original qualified names.
4661        let project_exprs = schema
4662            .iter()
4663            .filter(|(_, field)| field.name() != DATA_SCHEMA_TSID_COLUMN_NAME)
4664            .map(|(qualifier, field)| {
4665                DfExpr::Column(Column::new(qualifier.cloned(), field.name().clone()))
4666            })
4667            .collect::<Vec<_>>();
4668
4669        LogicalPlanBuilder::from(plan)
4670            .project(project_exprs)
4671            .context(DataFusionPlanningSnafu)?
4672            .build()
4673            .context(DataFusionPlanningSnafu)
4674    }
4675
4676    /// Apply an alias to the query result by adding a projection with the alias name
4677    fn apply_alias(&mut self, plan: LogicalPlan, alias_name: String) -> Result<LogicalPlan> {
4678        let fields_expr = self.create_field_column_exprs()?;
4679
4680        // TODO(dennis): how to support multi-value aliasing?
4681        ensure!(
4682            fields_expr.len() == 1,
4683            UnsupportedExprSnafu {
4684                name: "alias on multi-value result"
4685            }
4686        );
4687
4688        let project_fields = fields_expr
4689            .into_iter()
4690            .map(|expr| expr.alias(&alias_name))
4691            .chain(self.create_tag_column_exprs()?)
4692            .chain(Some(self.create_time_index_column_expr()?));
4693
4694        LogicalPlanBuilder::from(plan)
4695            .project(project_fields)
4696            .context(DataFusionPlanningSnafu)?
4697            .build()
4698            .context(DataFusionPlanningSnafu)
4699    }
4700}
4701
4702#[derive(Default, Debug)]
4703struct FunctionArgs {
4704    input: Option<PromExpr>,
4705    literals: Vec<DfExpr>,
4706}
4707
4708/// Represents different types of scalar functions supported in PromQL expressions.
4709/// Each variant defines how the function should be processed and what arguments it expects.
4710#[derive(Debug, Clone)]
4711enum ScalarFunc {
4712    /// DataFusion's registered(including built-in) scalar functions (e.g., abs, sqrt, round, clamp).
4713    /// These are passed through directly to DataFusion's execution engine.
4714    /// Processing: Simple argument insertion at the specified position.
4715    DataFusionBuiltin(Arc<ScalarUdfDef>),
4716    /// User-defined functions registered in DataFusion's function registry.
4717    /// Similar to DataFusionBuiltin but for custom functions not built into DataFusion.
4718    /// Processing: Direct pass-through with argument positioning.
4719    DataFusionUdf(Arc<ScalarUdfDef>),
4720    /// PromQL-specific functions that operate on time series data with temporal context.
4721    /// These functions require both timestamp ranges and values to perform calculations.
4722    /// Processing: Automatically injects timestamp_range and value columns as first arguments.
4723    /// Examples: idelta, irate, resets, changes, deriv, *_over_time function
4724    Udf(Arc<ScalarUdfDef>),
4725    /// PromQL functions requiring extrapolation calculations with explicit range information.
4726    /// These functions need to know the time range length to perform rate calculations.
4727    /// The second field contains the range length in milliseconds.
4728    /// Processing: Injects timestamp_range, value, time_index columns and appends range_length.
4729    /// Examples: increase, rate, delta
4730    // TODO(ruihang): maybe merge with Udf later
4731    ExtrapolateUdf(Arc<ScalarUdfDef>, i64),
4732    /// Functions that generate expressions directly without external UDF calls.
4733    /// The expression is constructed during function matching and requires no additional processing.
4734    /// Examples: time(), minute(), hour(), month(), year() and other date/time extractors
4735    GeneratedExpr,
4736}
4737
4738#[cfg(test)]
4739mod test {
4740    use std::time::{Duration, UNIX_EPOCH};
4741
4742    use catalog::RegisterTableRequest;
4743    use catalog::memory::{MemoryCatalogManager, new_memory_catalog_manager};
4744    use common_base::Plugins;
4745    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
4746    use common_query::prelude::greptime_timestamp;
4747    use common_query::test_util::DummyDecoder;
4748    use datafusion::arrow::datatypes::Schema as ArrowSchema;
4749    use datafusion::datasource::memory::MemorySourceConfig;
4750    use datafusion::datasource::source::DataSourceExec;
4751    use datafusion::logical_expr::Extension;
4752    use datatypes::prelude::ConcreteDataType;
4753    use datatypes::schema::{ColumnSchema, Schema};
4754    use promql_parser::label::Labels;
4755    use promql_parser::parser;
4756    use session::context::QueryContext;
4757    use table::metadata::{TableInfoBuilder, TableMetaBuilder};
4758    use table::test_util::EmptyTable;
4759
4760    use super::*;
4761    use crate::QueryEngineContext;
4762    use crate::options::QueryOptions;
4763    use crate::parser::QueryLanguageParser;
4764
4765    fn find_instant_manipulate(plan: &LogicalPlan) -> Option<&InstantManipulate> {
4766        if let LogicalPlan::Extension(Extension { node }) = plan
4767            && let Some(instant_manipulate) = node.as_any().downcast_ref::<InstantManipulate>()
4768        {
4769            return Some(instant_manipulate);
4770        }
4771
4772        plan.inputs().into_iter().find_map(find_instant_manipulate)
4773    }
4774
4775    fn build_query_engine_state() -> QueryEngineState {
4776        QueryEngineState::new(
4777            new_memory_catalog_manager().unwrap(),
4778            None,
4779            None,
4780            None,
4781            None,
4782            None,
4783            false,
4784            Plugins::default(),
4785            QueryOptions::default(),
4786        )
4787    }
4788
4789    async fn build_optimized_promql_plan(
4790        table_provider: DfTableSourceProvider,
4791        eval_stmt: &EvalStmt,
4792    ) -> LogicalPlan {
4793        let state = build_query_engine_state();
4794        let raw_plan = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &state)
4795            .await
4796            .unwrap();
4797        let context = QueryEngineContext::new(state.session_state(), QueryContext::arc());
4798        state
4799            .optimize_by_extension_rules(raw_plan, &context)
4800            .unwrap()
4801    }
4802
4803    async fn build_optimized_tsid_plan(
4804        query: &str,
4805        num_tag: usize,
4806        num_field: usize,
4807        end_secs: u64,
4808        lookback_secs: u64,
4809    ) -> String {
4810        let eval_stmt = EvalStmt {
4811            expr: parser::parse(query).unwrap(),
4812            start: UNIX_EPOCH,
4813            end: UNIX_EPOCH
4814                .checked_add(Duration::from_secs(end_secs))
4815                .unwrap(),
4816            interval: Duration::from_secs(5),
4817            lookback_delta: Duration::from_secs(lookback_secs),
4818        };
4819        let table_provider = build_test_table_provider_with_tsid(
4820            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
4821            num_tag,
4822            num_field,
4823        )
4824        .await;
4825
4826        build_optimized_promql_plan(table_provider, &eval_stmt)
4827            .await
4828            .display_indent_schema()
4829            .to_string()
4830    }
4831
4832    async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) {
4833        let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
4834
4835        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
4836        assert!(plan_str.contains("Projection: some_metric.timestamp, some_metric.tag_0"));
4837        assert!(plan_str.contains("Distinct:"));
4838        assert!(plan_str.contains(expected_outer_agg), "{plan_str}");
4839        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"));
4840    }
4841
4842    async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) {
4843        let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await;
4844        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
4845    }
4846
4847    fn build_eval_stmt(expr: &str) -> EvalStmt {
4848        EvalStmt {
4849            expr: parser::parse(expr).unwrap(),
4850            start: UNIX_EPOCH,
4851            end: UNIX_EPOCH
4852                .checked_add(Duration::from_secs(100_000))
4853                .unwrap(),
4854            interval: Duration::from_secs(5),
4855            lookback_delta: Duration::from_secs(1),
4856        }
4857    }
4858
4859    async fn build_test_table_provider(
4860        table_name_tuples: &[(String, String)],
4861        num_tag: usize,
4862        num_field: usize,
4863    ) -> DfTableSourceProvider {
4864        let catalog_list = MemoryCatalogManager::with_default_setup();
4865        for (schema_name, table_name) in table_name_tuples {
4866            let mut columns = vec![];
4867            for i in 0..num_tag {
4868                columns.push(ColumnSchema::new(
4869                    format!("tag_{i}"),
4870                    ConcreteDataType::string_datatype(),
4871                    false,
4872                ));
4873            }
4874            columns.push(
4875                ColumnSchema::new(
4876                    "timestamp".to_string(),
4877                    ConcreteDataType::timestamp_millisecond_datatype(),
4878                    false,
4879                )
4880                .with_time_index(true),
4881            );
4882            for i in 0..num_field {
4883                columns.push(ColumnSchema::new(
4884                    format!("field_{i}"),
4885                    ConcreteDataType::float64_datatype(),
4886                    true,
4887                ));
4888            }
4889            let schema = Arc::new(Schema::new(columns));
4890            let table_meta = TableMetaBuilder::empty()
4891                .schema(schema)
4892                .primary_key_indices((0..num_tag).collect())
4893                .value_indices((num_tag + 1..num_tag + 1 + num_field).collect())
4894                .next_column_id(1024)
4895                .build()
4896                .unwrap();
4897            let table_info = TableInfoBuilder::default()
4898                .name(table_name.clone())
4899                .meta(table_meta)
4900                .build()
4901                .unwrap();
4902            let table = EmptyTable::from_table_info(&table_info);
4903
4904            assert!(
4905                catalog_list
4906                    .register_table_sync(RegisterTableRequest {
4907                        catalog: DEFAULT_CATALOG_NAME.to_string(),
4908                        schema: schema_name.clone(),
4909                        table_name: table_name.clone(),
4910                        table_id: 1024,
4911                        table,
4912                    })
4913                    .is_ok()
4914            );
4915        }
4916
4917        DfTableSourceProvider::new(
4918            catalog_list,
4919            false,
4920            QueryContext::arc(),
4921            DummyDecoder::arc(),
4922            false,
4923        )
4924    }
4925
4926    async fn build_test_table_provider_with_tsid(
4927        table_name_tuples: &[(String, String)],
4928        num_tag: usize,
4929        num_field: usize,
4930    ) -> DfTableSourceProvider {
4931        let table_specs = table_name_tuples
4932            .iter()
4933            .map(|(schema_name, table_name)| ((schema_name.clone(), table_name.clone()), num_field))
4934            .collect::<Vec<_>>();
4935        build_test_table_provider_with_tsid_fields(&table_specs, num_tag).await
4936    }
4937
4938    async fn build_test_table_provider_with_tsid_fields(
4939        table_specs: &[((String, String), usize)],
4940        num_tag: usize,
4941    ) -> DfTableSourceProvider {
4942        let table_specs = table_specs
4943            .iter()
4944            .map(|(table_name_tuple, num_field)| (table_name_tuple.clone(), num_tag, *num_field))
4945            .collect::<Vec<_>>();
4946        build_test_table_provider_with_tsid_tag_fields(&table_specs).await
4947    }
4948
4949    async fn build_test_table_provider_with_tsid_tag_fields(
4950        table_specs: &[((String, String), usize, usize)],
4951    ) -> DfTableSourceProvider {
4952        let catalog_list = MemoryCatalogManager::with_default_setup();
4953
4954        let physical_table_name = "phy";
4955        let physical_table_id = 999u32;
4956        let physical_num_tag = table_specs
4957            .iter()
4958            .map(|(_, num_tag, _)| *num_tag)
4959            .max()
4960            .unwrap_or(0);
4961        let physical_num_field = table_specs
4962            .iter()
4963            .map(|(_, _, num_field)| *num_field)
4964            .max()
4965            .unwrap_or(0);
4966
4967        // Register a metric engine physical table with internal columns.
4968        {
4969            let mut columns = vec![
4970                ColumnSchema::new(
4971                    DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
4972                    ConcreteDataType::uint32_datatype(),
4973                    false,
4974                ),
4975                ColumnSchema::new(
4976                    DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
4977                    ConcreteDataType::uint64_datatype(),
4978                    false,
4979                ),
4980            ];
4981            for i in 0..physical_num_tag {
4982                columns.push(ColumnSchema::new(
4983                    format!("tag_{i}"),
4984                    ConcreteDataType::string_datatype(),
4985                    false,
4986                ));
4987            }
4988            columns.push(
4989                ColumnSchema::new(
4990                    "timestamp".to_string(),
4991                    ConcreteDataType::timestamp_millisecond_datatype(),
4992                    false,
4993                )
4994                .with_time_index(true),
4995            );
4996            for i in 0..physical_num_field {
4997                columns.push(ColumnSchema::new(
4998                    format!("field_{i}"),
4999                    ConcreteDataType::float64_datatype(),
5000                    true,
5001                ));
5002            }
5003
5004            let schema = Arc::new(Schema::new(columns));
5005            let primary_key_indices = (0..(2 + physical_num_tag)).collect::<Vec<_>>();
5006            let table_meta = TableMetaBuilder::empty()
5007                .schema(schema)
5008                .primary_key_indices(primary_key_indices)
5009                .value_indices(
5010                    (2 + physical_num_tag..2 + physical_num_tag + 1 + physical_num_field).collect(),
5011                )
5012                .engine(METRIC_ENGINE_NAME.to_string())
5013                .next_column_id(1024)
5014                .build()
5015                .unwrap();
5016            let table_info = TableInfoBuilder::default()
5017                .table_id(physical_table_id)
5018                .name(physical_table_name)
5019                .meta(table_meta)
5020                .build()
5021                .unwrap();
5022            let table = EmptyTable::from_table_info(&table_info);
5023
5024            assert!(
5025                catalog_list
5026                    .register_table_sync(RegisterTableRequest {
5027                        catalog: DEFAULT_CATALOG_NAME.to_string(),
5028                        schema: DEFAULT_SCHEMA_NAME.to_string(),
5029                        table_name: physical_table_name.to_string(),
5030                        table_id: physical_table_id,
5031                        table,
5032                    })
5033                    .is_ok()
5034            );
5035        }
5036
5037        // Register metric engine logical tables without `__tsid`, referencing the physical table.
5038        for (idx, ((schema_name, table_name), num_tag, num_field)) in table_specs.iter().enumerate()
5039        {
5040            let mut columns = vec![];
5041            for i in 0..*num_tag {
5042                columns.push(ColumnSchema::new(
5043                    format!("tag_{i}"),
5044                    ConcreteDataType::string_datatype(),
5045                    false,
5046                ));
5047            }
5048            columns.push(
5049                ColumnSchema::new(
5050                    "timestamp".to_string(),
5051                    ConcreteDataType::timestamp_millisecond_datatype(),
5052                    false,
5053                )
5054                .with_time_index(true),
5055            );
5056            for i in 0..*num_field {
5057                columns.push(ColumnSchema::new(
5058                    format!("field_{i}"),
5059                    ConcreteDataType::float64_datatype(),
5060                    true,
5061                ));
5062            }
5063
5064            let schema = Arc::new(Schema::new(columns));
5065            let mut options = table::requests::TableOptions::default();
5066            options.extra_options.insert(
5067                LOGICAL_TABLE_METADATA_KEY.to_string(),
5068                physical_table_name.to_string(),
5069            );
5070            let table_id = 1024u32 + idx as u32;
5071            let table_meta = TableMetaBuilder::empty()
5072                .schema(schema)
5073                .primary_key_indices((0..*num_tag).collect())
5074                .value_indices((*num_tag + 1..*num_tag + 1 + *num_field).collect())
5075                .engine(METRIC_ENGINE_NAME.to_string())
5076                .options(options)
5077                .next_column_id(1024)
5078                .build()
5079                .unwrap();
5080            let table_info = TableInfoBuilder::default()
5081                .table_id(table_id)
5082                .name(table_name.clone())
5083                .meta(table_meta)
5084                .build()
5085                .unwrap();
5086            let table = EmptyTable::from_table_info(&table_info);
5087
5088            assert!(
5089                catalog_list
5090                    .register_table_sync(RegisterTableRequest {
5091                        catalog: DEFAULT_CATALOG_NAME.to_string(),
5092                        schema: schema_name.clone(),
5093                        table_name: table_name.clone(),
5094                        table_id,
5095                        table,
5096                    })
5097                    .is_ok()
5098            );
5099        }
5100
5101        DfTableSourceProvider::new(
5102            catalog_list,
5103            false,
5104            QueryContext::arc(),
5105            DummyDecoder::arc(),
5106            false,
5107        )
5108    }
5109
5110    async fn build_test_table_provider_with_fields(
5111        table_name_tuples: &[(String, String)],
5112        tags: &[&str],
5113    ) -> DfTableSourceProvider {
5114        let catalog_list = MemoryCatalogManager::with_default_setup();
5115        for (schema_name, table_name) in table_name_tuples {
5116            let mut columns = vec![];
5117            let num_tag = tags.len();
5118            for tag in tags {
5119                columns.push(ColumnSchema::new(
5120                    tag.to_string(),
5121                    ConcreteDataType::string_datatype(),
5122                    false,
5123                ));
5124            }
5125            columns.push(
5126                ColumnSchema::new(
5127                    greptime_timestamp().to_string(),
5128                    ConcreteDataType::timestamp_millisecond_datatype(),
5129                    false,
5130                )
5131                .with_time_index(true),
5132            );
5133            columns.push(ColumnSchema::new(
5134                greptime_value().to_string(),
5135                ConcreteDataType::float64_datatype(),
5136                true,
5137            ));
5138            let schema = Arc::new(Schema::new(columns));
5139            let table_meta = TableMetaBuilder::empty()
5140                .schema(schema)
5141                .primary_key_indices((0..num_tag).collect())
5142                .next_column_id(1024)
5143                .build()
5144                .unwrap();
5145            let table_info = TableInfoBuilder::default()
5146                .name(table_name.clone())
5147                .meta(table_meta)
5148                .build()
5149                .unwrap();
5150            let table = EmptyTable::from_table_info(&table_info);
5151
5152            assert!(
5153                catalog_list
5154                    .register_table_sync(RegisterTableRequest {
5155                        catalog: DEFAULT_CATALOG_NAME.to_string(),
5156                        schema: schema_name.clone(),
5157                        table_name: table_name.clone(),
5158                        table_id: 1024,
5159                        table,
5160                    })
5161                    .is_ok()
5162            );
5163        }
5164
5165        DfTableSourceProvider::new(
5166            catalog_list,
5167            false,
5168            QueryContext::arc(),
5169            DummyDecoder::arc(),
5170            false,
5171        )
5172    }
5173
5174    // {
5175    //     input: `abs(some_metric{foo!="bar"})`,
5176    //     expected: &Call{
5177    //         Func: MustGetFunction("abs"),
5178    //         Args: Expressions{
5179    //             &VectorSelector{
5180    //                 Name: "some_metric",
5181    //                 LabelMatchers: []*labels.Matcher{
5182    //                     MustLabelMatcher(labels.MatchNotEqual, "foo", "bar"),
5183    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
5184    //                 },
5185    //             },
5186    //         },
5187    //     },
5188    // },
5189    async fn do_single_instant_function_call(fn_name: &'static str, plan_name: &str) {
5190        let prom_expr =
5191            parser::parse(&format!("{fn_name}(some_metric{{tag_0!=\"bar\"}})")).unwrap();
5192        let eval_stmt = EvalStmt {
5193            expr: prom_expr,
5194            start: UNIX_EPOCH,
5195            end: UNIX_EPOCH
5196                .checked_add(Duration::from_secs(100_000))
5197                .unwrap(),
5198            interval: Duration::from_secs(5),
5199            lookback_delta: Duration::from_secs(1),
5200        };
5201
5202        let table_provider = build_test_table_provider(
5203            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5204            1,
5205            1,
5206        )
5207        .await;
5208        let plan =
5209            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5210                .await
5211                .unwrap();
5212
5213        let expected = String::from(
5214            "Filter: TEMPLATE(field_0) IS NOT NULL [timestamp:Timestamp(ms), TEMPLATE(field_0):Float64;N, tag_0:Utf8]\
5215            \n  Projection: some_metric.timestamp, TEMPLATE(some_metric.field_0) AS TEMPLATE(field_0), some_metric.tag_0 [timestamp:Timestamp(ms), TEMPLATE(field_0):Float64;N, tag_0:Utf8]\
5216            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5217            \n      PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5218            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5219	            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
5220            \n            TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"
5221        ).replace("TEMPLATE", plan_name);
5222
5223        assert_eq!(plan.display_indent_schema().to_string(), expected);
5224    }
5225
5226    #[tokio::test]
5227    async fn single_abs() {
5228        do_single_instant_function_call("abs", "abs").await;
5229    }
5230
5231    #[tokio::test]
5232    #[should_panic]
5233    async fn single_absent() {
5234        do_single_instant_function_call("absent", "").await;
5235    }
5236
5237    #[tokio::test]
5238    async fn single_ceil() {
5239        do_single_instant_function_call("ceil", "ceil").await;
5240    }
5241
5242    #[tokio::test]
5243    async fn single_exp() {
5244        do_single_instant_function_call("exp", "exp").await;
5245    }
5246
5247    #[tokio::test]
5248    async fn single_ln() {
5249        do_single_instant_function_call("ln", "ln").await;
5250    }
5251
5252    #[tokio::test]
5253    async fn single_log2() {
5254        do_single_instant_function_call("log2", "log2").await;
5255    }
5256
5257    #[tokio::test]
5258    async fn single_log10() {
5259        do_single_instant_function_call("log10", "log10").await;
5260    }
5261
5262    #[tokio::test]
5263    #[should_panic]
5264    async fn single_scalar() {
5265        do_single_instant_function_call("scalar", "").await;
5266    }
5267
5268    #[tokio::test]
5269    #[should_panic]
5270    async fn single_sgn() {
5271        do_single_instant_function_call("sgn", "").await;
5272    }
5273
5274    #[tokio::test]
5275    #[should_panic]
5276    async fn single_sort() {
5277        do_single_instant_function_call("sort", "").await;
5278    }
5279
5280    #[tokio::test]
5281    #[should_panic]
5282    async fn single_sort_desc() {
5283        do_single_instant_function_call("sort_desc", "").await;
5284    }
5285
5286    #[tokio::test]
5287    async fn single_sqrt() {
5288        do_single_instant_function_call("sqrt", "sqrt").await;
5289    }
5290
5291    #[tokio::test]
5292    #[should_panic]
5293    async fn single_timestamp() {
5294        do_single_instant_function_call("timestamp", "").await;
5295    }
5296
5297    #[tokio::test]
5298    async fn single_acos() {
5299        do_single_instant_function_call("acos", "acos").await;
5300    }
5301
5302    #[tokio::test]
5303    #[should_panic]
5304    async fn single_acosh() {
5305        do_single_instant_function_call("acosh", "").await;
5306    }
5307
5308    #[tokio::test]
5309    async fn single_asin() {
5310        do_single_instant_function_call("asin", "asin").await;
5311    }
5312
5313    #[tokio::test]
5314    #[should_panic]
5315    async fn single_asinh() {
5316        do_single_instant_function_call("asinh", "").await;
5317    }
5318
5319    #[tokio::test]
5320    async fn single_atan() {
5321        do_single_instant_function_call("atan", "atan").await;
5322    }
5323
5324    #[tokio::test]
5325    #[should_panic]
5326    async fn single_atanh() {
5327        do_single_instant_function_call("atanh", "").await;
5328    }
5329
5330    #[tokio::test]
5331    async fn single_cos() {
5332        do_single_instant_function_call("cos", "cos").await;
5333    }
5334
5335    #[tokio::test]
5336    #[should_panic]
5337    async fn single_cosh() {
5338        do_single_instant_function_call("cosh", "").await;
5339    }
5340
5341    #[tokio::test]
5342    async fn single_sin() {
5343        do_single_instant_function_call("sin", "sin").await;
5344    }
5345
5346    #[tokio::test]
5347    #[should_panic]
5348    async fn single_sinh() {
5349        do_single_instant_function_call("sinh", "").await;
5350    }
5351
5352    #[tokio::test]
5353    async fn single_tan() {
5354        do_single_instant_function_call("tan", "tan").await;
5355    }
5356
5357    #[tokio::test]
5358    #[should_panic]
5359    async fn single_tanh() {
5360        do_single_instant_function_call("tanh", "").await;
5361    }
5362
5363    #[tokio::test]
5364    #[should_panic]
5365    async fn single_deg() {
5366        do_single_instant_function_call("deg", "").await;
5367    }
5368
5369    #[tokio::test]
5370    #[should_panic]
5371    async fn single_rad() {
5372        do_single_instant_function_call("rad", "").await;
5373    }
5374
5375    // {
5376    //     input: "avg by (foo)(some_metric)",
5377    //     expected: &AggregateExpr{
5378    //         Op: AVG,
5379    //         Expr: &VectorSelector{
5380    //             Name: "some_metric",
5381    //             LabelMatchers: []*labels.Matcher{
5382    //                 MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
5383    //             },
5384    //             PosRange: PositionRange{
5385    //                 Start: 13,
5386    //                 End:   24,
5387    //             },
5388    //         },
5389    //         Grouping: []string{"foo"},
5390    //         PosRange: PositionRange{
5391    //             Start: 0,
5392    //             End:   25,
5393    //         },
5394    //     },
5395    // },
5396    async fn do_aggregate_expr_plan(fn_name: &str, plan_name: &str) {
5397        let prom_expr = parser::parse(&format!(
5398            "{fn_name} by (tag_1)(some_metric{{tag_0!=\"bar\"}})",
5399        ))
5400        .unwrap();
5401        let mut eval_stmt = EvalStmt {
5402            expr: prom_expr,
5403            start: UNIX_EPOCH,
5404            end: UNIX_EPOCH
5405                .checked_add(Duration::from_secs(100_000))
5406                .unwrap(),
5407            interval: Duration::from_secs(5),
5408            lookback_delta: Duration::from_secs(1),
5409        };
5410
5411        // test group by
5412        let table_provider = build_test_table_provider(
5413            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5414            2,
5415            2,
5416        )
5417        .await;
5418        let plan =
5419            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5420                .await
5421                .unwrap();
5422        let expected_no_without = String::from(
5423            "Sort: some_metric.tag_1 ASC NULLS LAST, some_metric.timestamp ASC NULLS LAST [tag_1:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
5424            \n  Aggregate: groupBy=[[some_metric.tag_1, some_metric.timestamp]], aggr=[[TEMPLATE(some_metric.field_0), TEMPLATE(some_metric.field_1)]] [tag_1:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
5425            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5426            \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\"] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5427            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.tag_1 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5428            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5429            \n            TableScan: some_metric [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]"
5430        ).replace("TEMPLATE", plan_name);
5431        assert_eq!(
5432            plan.display_indent_schema().to_string(),
5433            expected_no_without
5434        );
5435
5436        // test group without
5437        if let PromExpr::Aggregate(AggregateExpr { modifier, .. }) = &mut eval_stmt.expr {
5438            *modifier = Some(LabelModifier::Exclude(Labels {
5439                labels: vec![String::from("tag_1")].into_iter().collect(),
5440            }));
5441        }
5442        let table_provider = build_test_table_provider(
5443            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5444            2,
5445            2,
5446        )
5447        .await;
5448        let plan =
5449            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5450                .await
5451                .unwrap();
5452        let expected_without = String::from(
5453            "Sort: some_metric.tag_0 ASC NULLS LAST, some_metric.timestamp ASC NULLS LAST [tag_0:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
5454            \n  Aggregate: groupBy=[[some_metric.tag_0, some_metric.timestamp]], aggr=[[TEMPLATE(some_metric.field_0), TEMPLATE(some_metric.field_1)]] [tag_0:Utf8, timestamp:Timestamp(ms), TEMPLATE(some_metric.field_0):Float64;N, TEMPLATE(some_metric.field_1):Float64;N]\
5455            \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5456            \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\"] [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5457            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.tag_1 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5458            \n          Filter: some_metric.tag_0 != Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]\
5459            \n            TableScan: some_metric [tag_0:Utf8, tag_1:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N]"
5460        ).replace("TEMPLATE", plan_name);
5461        assert_eq!(plan.display_indent_schema().to_string(), expected_without);
5462    }
5463
5464    #[tokio::test]
5465    async fn aggregate_sum() {
5466        do_aggregate_expr_plan("sum", "sum").await;
5467    }
5468
5469    #[tokio::test]
5470    async fn tsid_is_used_for_series_divide_when_available() {
5471        let prom_expr = parser::parse("some_metric").unwrap();
5472        let eval_stmt = EvalStmt {
5473            expr: prom_expr,
5474            start: UNIX_EPOCH,
5475            end: UNIX_EPOCH
5476                .checked_add(Duration::from_secs(100_000))
5477                .unwrap(),
5478            interval: Duration::from_secs(5),
5479            lookback_delta: Duration::from_secs(1),
5480        };
5481
5482        let table_provider = build_test_table_provider_with_tsid(
5483            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5484            1,
5485            1,
5486        )
5487        .await;
5488        let plan =
5489            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5490                .await
5491                .unwrap();
5492
5493        let plan_str = plan.display_indent_schema().to_string();
5494        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
5495        assert!(plan_str.contains("__tsid ASC NULLS FIRST"));
5496        assert!(
5497            !plan
5498                .schema()
5499                .fields()
5500                .iter()
5501                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
5502        );
5503
5504        let manipulate = find_instant_manipulate(&plan).unwrap();
5505        let exec = manipulate.to_execution_plan(Arc::new(DataSourceExec::new(Arc::new(
5506            MemorySourceConfig::try_new(&[], Arc::new(ArrowSchema::empty()), None).unwrap(),
5507        ))));
5508        assert!(format!("{exec:?}").contains("reuse_tsid_column: true"));
5509    }
5510
5511    #[tokio::test]
5512    async fn default_binary_join_uses_tsid_when_available() {
5513        let eval_stmt = build_eval_stmt("some_metric / some_alt_metric");
5514
5515        let table_provider = build_test_table_provider_with_tsid(
5516            &[
5517                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5518                (
5519                    DEFAULT_SCHEMA_NAME.to_string(),
5520                    "some_alt_metric".to_string(),
5521                ),
5522            ],
5523            1,
5524            1,
5525        )
5526        .await;
5527        let plan =
5528            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5529                .await
5530                .unwrap();
5531
5532        let plan_str = plan.display_indent_schema().to_string();
5533        assert!(
5534            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
5535            "{plan_str}"
5536        );
5537        assert!(
5538            !plan_str.contains("some_metric.tag_0 = some_alt_metric.tag_0"),
5539            "{plan_str}"
5540        );
5541    }
5542
5543    #[tokio::test]
5544    async fn timestamp_binary_join_falls_back_when_tsid_is_projected_out() {
5545        for query in [
5546            "timestamp(some_metric) / some_metric",
5547            "some_metric / timestamp(some_metric)",
5548        ] {
5549            let eval_stmt = build_eval_stmt(query);
5550
5551            let table_provider = build_test_table_provider_with_tsid(
5552                &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5553                1,
5554                1,
5555            )
5556            .await;
5557            let plan =
5558                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5559                    .await
5560                    .unwrap();
5561
5562            let plan_str = plan.display_indent_schema().to_string();
5563            assert!(!plan_str.contains("__tsid ="), "{query}: {plan_str}");
5564            assert!(
5565                plan_str.contains("lhs.tag_0 = rhs.tag_0"),
5566                "{query}: {plan_str}"
5567            );
5568            assert!(
5569                !plan
5570                    .schema()
5571                    .fields()
5572                    .iter()
5573                    .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME),
5574                "{query}: {plan_str}"
5575            );
5576        }
5577    }
5578
5579    #[tokio::test]
5580    async fn timestamp_binary_join_rejects_default_matching_on_mismatched_labels() {
5581        let eval_stmt = build_eval_stmt("timestamp(left_host_job) / right_by_job");
5582
5583        let table_provider = build_test_table_provider_with_tsid_tag_fields(&[
5584            (
5585                (DEFAULT_SCHEMA_NAME.to_string(), "left_host_job".to_string()),
5586                2,
5587                1,
5588            ),
5589            (
5590                (DEFAULT_SCHEMA_NAME.to_string(), "right_by_job".to_string()),
5591                1,
5592                1,
5593            ),
5594        ])
5595        .await;
5596        let plan =
5597            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5598                .await
5599                .unwrap();
5600        let plan_str = plan.display_indent_schema().to_string();
5601
5602        assert!(
5603            plan_str.contains("Boolean(false)") || plan_str.contains("false"),
5604            "{plan_str}"
5605        );
5606    }
5607
5608    #[tokio::test]
5609    async fn tsid_is_preserved_for_nested_default_binary_joins() {
5610        let eval_stmt = build_eval_stmt("(some_metric - some_alt_metric) / some_third_metric");
5611
5612        let table_provider = build_test_table_provider_with_tsid(
5613            &[
5614                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5615                (
5616                    DEFAULT_SCHEMA_NAME.to_string(),
5617                    "some_alt_metric".to_string(),
5618                ),
5619                (
5620                    DEFAULT_SCHEMA_NAME.to_string(),
5621                    "some_third_metric".to_string(),
5622                ),
5623            ],
5624            1,
5625            1,
5626        )
5627        .await;
5628        let plan =
5629            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5630                .await
5631                .unwrap();
5632
5633        let plan_str = plan.display_indent_schema().to_string();
5634        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5635        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5636    }
5637
5638    #[tokio::test]
5639    async fn repeated_tsid_binary_operand_reuses_leaf_plan() {
5640        let eval_stmt = build_eval_stmt("((some_metric - some_alt_metric) / some_metric) * 100");
5641
5642        let table_provider = build_test_table_provider_with_tsid(
5643            &[
5644                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5645                (
5646                    DEFAULT_SCHEMA_NAME.to_string(),
5647                    "some_alt_metric".to_string(),
5648                ),
5649            ],
5650            1,
5651            1,
5652        )
5653        .await;
5654        let plan =
5655            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5656                .await
5657                .unwrap();
5658
5659        let plan_str = plan.display_indent_schema().to_string();
5660        assert_eq!(plan_str.matches("__tsid =").count(), 1, "{plan_str}");
5661        assert_eq!(
5662            plan_str
5663                .matches("Filter: phy.__table_id = UInt32(1024)")
5664                .count(),
5665            1,
5666            "{plan_str}"
5667        );
5668        assert_eq!(
5669            plan_str.matches("PromInstantManipulate").count(),
5670            2,
5671            "{plan_str}"
5672        );
5673        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5674    }
5675
5676    #[tokio::test]
5677    async fn repeated_tsid_binary_operand_reuses_shorter_field_side() {
5678        let eval_stmt =
5679            build_eval_stmt("((two_field_metric - one_field_metric) / one_field_metric) * 100");
5680
5681        let table_provider = build_test_table_provider_with_tsid_fields(
5682            &[
5683                (
5684                    (
5685                        DEFAULT_SCHEMA_NAME.to_string(),
5686                        "two_field_metric".to_string(),
5687                    ),
5688                    2,
5689                ),
5690                (
5691                    (
5692                        DEFAULT_SCHEMA_NAME.to_string(),
5693                        "one_field_metric".to_string(),
5694                    ),
5695                    1,
5696                ),
5697            ],
5698            1,
5699        )
5700        .await;
5701        let plan =
5702            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5703                .await
5704                .unwrap();
5705
5706        let field_names = plan
5707            .schema()
5708            .fields()
5709            .iter()
5710            .map(|field| field.name().clone())
5711            .collect::<Vec<_>>();
5712        let value_columns = field_names
5713            .iter()
5714            .filter(|name| {
5715                *name != "tag_0" && *name != "timestamp" && *name != DATA_SCHEMA_TSID_COLUMN_NAME
5716            })
5717            .count();
5718        assert_eq!(value_columns, 1, "{field_names:?}");
5719        let plan_str = plan.display_indent_schema().to_string();
5720        assert_eq!(plan_str.matches("__tsid =").count(), 1, "{plan_str}");
5721        assert_eq!(
5722            plan_str
5723                .matches("Filter: phy.__table_id = UInt32(1025)")
5724                .count(),
5725            1,
5726            "{plan_str}"
5727        );
5728        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
5729    }
5730
5731    #[tokio::test]
5732    async fn binary_island_reuses_self_operand_without_join() {
5733        let eval_stmt = build_eval_stmt("some_metric / some_metric");
5734
5735        let table_provider = build_test_table_provider_with_tsid(
5736            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
5737            1,
5738            1,
5739        )
5740        .await;
5741        let plan =
5742            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5743                .await
5744                .unwrap();
5745
5746        let plan_str = plan.display_indent_schema().to_string();
5747        assert_eq!(plan_str.matches("__tsid =").count(), 0, "{plan_str}");
5748        assert_eq!(
5749            plan_str
5750                .matches("Filter: phy.__table_id = UInt32(1024)")
5751                .count(),
5752            1,
5753            "{plan_str}"
5754        );
5755        assert_eq!(
5756            plan_str.matches("PromInstantManipulate").count(),
5757            1,
5758            "{plan_str}"
5759        );
5760    }
5761
5762    #[tokio::test]
5763    async fn binary_island_reuses_leaf_across_two_branches() {
5764        let eval_stmt =
5765            build_eval_stmt("(some_metric + some_alt_metric) / (some_metric + third_metric)");
5766
5767        let table_provider = build_test_table_provider_with_tsid(
5768            &[
5769                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5770                (
5771                    DEFAULT_SCHEMA_NAME.to_string(),
5772                    "some_alt_metric".to_string(),
5773                ),
5774                (DEFAULT_SCHEMA_NAME.to_string(), "third_metric".to_string()),
5775            ],
5776            1,
5777            1,
5778        )
5779        .await;
5780        let plan =
5781            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5782                .await
5783                .unwrap();
5784
5785        let plan_str = plan.display_indent_schema().to_string();
5786        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5787        assert_eq!(
5788            plan_str
5789                .matches("Filter: phy.__table_id = UInt32(1024)")
5790                .count(),
5791            1,
5792            "{plan_str}"
5793        );
5794        assert_eq!(
5795            plan_str.matches("PromInstantManipulate").count(),
5796            3,
5797            "{plan_str}"
5798        );
5799    }
5800
5801    #[tokio::test]
5802    async fn binary_island_generated_alias_avoids_user_column_names() {
5803        let eval_stmt = build_eval_stmt("(some_metric + some_alt_metric) / some_metric");
5804
5805        let table_provider = build_test_table_provider_with_fields(
5806            &[
5807                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5808                (
5809                    DEFAULT_SCHEMA_NAME.to_string(),
5810                    "some_alt_metric".to_string(),
5811                ),
5812            ],
5813            &["prom_v0", "__prom_v0"],
5814        )
5815        .await;
5816        let plan =
5817            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5818                .await
5819                .unwrap();
5820
5821        let field_names = plan.schema().field_names();
5822        assert!(field_names.iter().any(|name| name.ends_with(".prom_v0")));
5823        assert!(field_names.iter().any(|name| name.ends_with(".__prom_v0")));
5824
5825        let plan_str = plan.display_indent_schema().to_string();
5826        assert!(plan_str.contains("SubqueryAlias: __prom_v0"), "{plan_str}");
5827        assert_eq!(
5828            plan_str.matches("PromInstantManipulate").count(),
5829            2,
5830            "{plan_str}"
5831        );
5832    }
5833
5834    #[tokio::test]
5835    async fn binary_island_clears_qualifier_for_nested_unary_projection() {
5836        let eval_stmt = build_eval_stmt("-((some_metric + some_alt_metric) / some_metric)");
5837
5838        let table_provider = build_test_table_provider_with_tsid(
5839            &[
5840                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5841                (
5842                    DEFAULT_SCHEMA_NAME.to_string(),
5843                    "some_alt_metric".to_string(),
5844                ),
5845            ],
5846            1,
5847            1,
5848        )
5849        .await;
5850        let plan =
5851            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5852                .await
5853                .unwrap();
5854
5855        let plan_str = plan.display_indent_schema().to_string();
5856        assert_eq!(plan_str.matches("__tsid =").count(), 1, "{plan_str}");
5857        assert_eq!(
5858            plan_str.matches("PromInstantManipulate").count(),
5859            2,
5860            "{plan_str}"
5861        );
5862    }
5863
5864    #[tokio::test]
5865    async fn binary_island_keeps_distinct_matcher_leaves() {
5866        let eval_stmt = build_eval_stmt(
5867            "(some_metric{tag_0=\"foo\"} + some_alt_metric) / some_metric{tag_0=\"bar\"}",
5868        );
5869
5870        let table_provider = build_test_table_provider_with_tsid(
5871            &[
5872                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5873                (
5874                    DEFAULT_SCHEMA_NAME.to_string(),
5875                    "some_alt_metric".to_string(),
5876                ),
5877            ],
5878            1,
5879            1,
5880        )
5881        .await;
5882        let plan =
5883            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5884                .await
5885                .unwrap();
5886
5887        let plan_str = plan.display_indent_schema().to_string();
5888        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5889        assert_eq!(
5890            plan_str.matches("PromInstantManipulate").count(),
5891            3,
5892            "{plan_str}"
5893        );
5894    }
5895
5896    #[tokio::test]
5897    async fn binary_island_keeps_offset_leaves_distinct() {
5898        let eval_stmt = build_eval_stmt("(some_metric offset 5m + some_alt_metric) / some_metric");
5899
5900        let table_provider = build_test_table_provider_with_tsid(
5901            &[
5902                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5903                (
5904                    DEFAULT_SCHEMA_NAME.to_string(),
5905                    "some_alt_metric".to_string(),
5906                ),
5907            ],
5908            1,
5909            1,
5910        )
5911        .await;
5912        let plan =
5913            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5914                .await
5915                .unwrap();
5916
5917        let plan_str = plan.display_indent_schema().to_string();
5918        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5919        assert_eq!(
5920            plan_str.matches("PromInstantManipulate").count(),
5921            3,
5922            "{plan_str}"
5923        );
5924    }
5925
5926    #[tokio::test]
5927    async fn binary_island_falls_back_for_group_modifier() {
5928        let eval_stmt = build_eval_stmt(
5929            "(some_metric + ignoring(tag_0) group_left some_alt_metric) / some_metric",
5930        );
5931
5932        let table_provider = build_test_table_provider_with_tsid(
5933            &[
5934                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5935                (
5936                    DEFAULT_SCHEMA_NAME.to_string(),
5937                    "some_alt_metric".to_string(),
5938                ),
5939            ],
5940            1,
5941            1,
5942        )
5943        .await;
5944        let plan =
5945            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5946                .await
5947                .unwrap();
5948
5949        let plan_str = plan.display_indent_schema().to_string();
5950        assert_eq!(
5951            plan_str.matches("PromInstantManipulate").count(),
5952            3,
5953            "{plan_str}"
5954        );
5955    }
5956
5957    #[tokio::test]
5958    async fn binary_island_falls_back_for_comparison_filter() {
5959        let eval_stmt = build_eval_stmt("(some_metric > some_alt_metric) / some_metric");
5960
5961        let table_provider = build_test_table_provider_with_tsid(
5962            &[
5963                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
5964                (
5965                    DEFAULT_SCHEMA_NAME.to_string(),
5966                    "some_alt_metric".to_string(),
5967                ),
5968            ],
5969            1,
5970            1,
5971        )
5972        .await;
5973        let plan =
5974            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
5975                .await
5976                .unwrap();
5977
5978        let plan_str = plan.display_indent_schema().to_string();
5979        assert_eq!(plan_str.matches("__tsid =").count(), 2, "{plan_str}");
5980        assert_eq!(
5981            plan_str.matches("PromInstantManipulate").count(),
5982            3,
5983            "{plan_str}"
5984        );
5985    }
5986
5987    #[tokio::test]
5988    async fn tsid_binary_join_uses_shorter_field_side() {
5989        let eval_stmt = build_eval_stmt("one_field_metric / two_field_metric");
5990
5991        let table_provider = build_test_table_provider_with_tsid_fields(
5992            &[
5993                (
5994                    (
5995                        DEFAULT_SCHEMA_NAME.to_string(),
5996                        "one_field_metric".to_string(),
5997                    ),
5998                    1,
5999                ),
6000                (
6001                    (
6002                        DEFAULT_SCHEMA_NAME.to_string(),
6003                        "two_field_metric".to_string(),
6004                    ),
6005                    2,
6006                ),
6007            ],
6008            1,
6009        )
6010        .await;
6011        let plan =
6012            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6013                .await
6014                .unwrap();
6015
6016        let field_names = plan
6017            .schema()
6018            .fields()
6019            .iter()
6020            .map(|field| field.name().clone())
6021            .collect::<Vec<_>>();
6022        let value_columns = field_names
6023            .iter()
6024            .filter(|name| {
6025                *name != "tag_0" && *name != "timestamp" && *name != DATA_SCHEMA_TSID_COLUMN_NAME
6026            })
6027            .count();
6028        assert_eq!(value_columns, 1, "{field_names:?}");
6029    }
6030
6031    #[tokio::test]
6032    async fn comparison_binary_join_uses_shorter_field_side() {
6033        let eval_stmt = build_eval_stmt("two_field_metric > one_field_metric");
6034
6035        let table_provider = build_test_table_provider_with_tsid_fields(
6036            &[
6037                (
6038                    (
6039                        DEFAULT_SCHEMA_NAME.to_string(),
6040                        "two_field_metric".to_string(),
6041                    ),
6042                    2,
6043                ),
6044                (
6045                    (
6046                        DEFAULT_SCHEMA_NAME.to_string(),
6047                        "one_field_metric".to_string(),
6048                    ),
6049                    1,
6050                ),
6051            ],
6052            1,
6053        )
6054        .await;
6055        let plan =
6056            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6057                .await
6058                .unwrap();
6059
6060        let field_names = plan
6061            .schema()
6062            .fields()
6063            .iter()
6064            .map(|field| field.name().clone())
6065            .collect::<Vec<_>>();
6066        assert!(
6067            field_names.iter().any(|name| name == "field_0"),
6068            "{field_names:?}"
6069        );
6070        assert!(
6071            !field_names.iter().any(|name| name == "field_1"),
6072            "{field_names:?}"
6073        );
6074    }
6075
6076    #[tokio::test]
6077    async fn label_matching_modifier_disables_tsid_binary_join() {
6078        let eval_stmt = build_eval_stmt("some_metric / ignoring(tag_0) some_alt_metric");
6079
6080        let table_provider = build_test_table_provider_with_tsid(
6081            &[
6082                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6083                (
6084                    DEFAULT_SCHEMA_NAME.to_string(),
6085                    "some_alt_metric".to_string(),
6086                ),
6087            ],
6088            2,
6089            1,
6090        )
6091        .await;
6092        let plan =
6093            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6094                .await
6095                .unwrap();
6096
6097        let plan_str = plan.display_indent_schema().to_string();
6098        assert!(!plan_str.contains("__tsid ="), "{plan_str}");
6099        assert!(
6100            plan_str.contains("some_metric.tag_1 = some_alt_metric.tag_1"),
6101            "{plan_str}"
6102        );
6103    }
6104
6105    #[tokio::test]
6106    async fn ignoring_absent_label_keeps_tsid_binary_join() {
6107        let eval_stmt = build_eval_stmt("some_metric / ignoring(missing) some_alt_metric");
6108
6109        let table_provider = build_test_table_provider_with_tsid(
6110            &[
6111                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6112                (
6113                    DEFAULT_SCHEMA_NAME.to_string(),
6114                    "some_alt_metric".to_string(),
6115                ),
6116            ],
6117            2,
6118            1,
6119        )
6120        .await;
6121        let plan =
6122            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6123                .await
6124                .unwrap();
6125
6126        let plan_str = plan.display_indent_schema().to_string();
6127        assert!(
6128            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
6129            "{plan_str}"
6130        );
6131        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
6132        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6133    }
6134
6135    #[tokio::test]
6136    async fn range_function_keeps_tsid_for_absent_ignoring_binary_join() {
6137        let eval_stmt =
6138            build_eval_stmt("rate(some_metric[5m]) / ignoring(missing) some_alt_metric");
6139
6140        let table_provider = build_test_table_provider_with_tsid(
6141            &[
6142                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6143                (
6144                    DEFAULT_SCHEMA_NAME.to_string(),
6145                    "some_alt_metric".to_string(),
6146                ),
6147            ],
6148            2,
6149            1,
6150        )
6151        .await;
6152        let plan =
6153            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6154                .await
6155                .unwrap();
6156
6157        let plan_str = plan.display_indent_schema().to_string();
6158        assert!(
6159            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
6160            "{plan_str}"
6161        );
6162        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
6163        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6164    }
6165
6166    #[tokio::test]
6167    async fn on_full_label_set_keeps_tsid_binary_join() {
6168        let eval_stmt = build_eval_stmt("some_metric / on(tag_0, tag_1) some_alt_metric");
6169
6170        let table_provider = build_test_table_provider_with_tsid(
6171            &[
6172                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6173                (
6174                    DEFAULT_SCHEMA_NAME.to_string(),
6175                    "some_alt_metric".to_string(),
6176                ),
6177            ],
6178            2,
6179            1,
6180        )
6181        .await;
6182        let plan =
6183            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6184                .await
6185                .unwrap();
6186
6187        let plan_str = plan.display_indent_schema().to_string();
6188        assert!(
6189            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
6190            "{plan_str}"
6191        );
6192        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
6193        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6194    }
6195
6196    #[tokio::test]
6197    async fn on_partial_label_set_disables_tsid_binary_join() {
6198        let eval_stmt = build_eval_stmt("some_metric / on(tag_0) some_alt_metric");
6199
6200        let table_provider = build_test_table_provider_with_tsid(
6201            &[
6202                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6203                (
6204                    DEFAULT_SCHEMA_NAME.to_string(),
6205                    "some_alt_metric".to_string(),
6206                ),
6207            ],
6208            2,
6209            1,
6210        )
6211        .await;
6212        let plan =
6213            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6214                .await
6215                .unwrap();
6216
6217        let plan_str = plan.display_indent_schema().to_string();
6218        assert!(!plan_str.contains("__tsid ="), "{plan_str}");
6219        assert!(
6220            plan_str.contains("some_metric.tag_0 = some_alt_metric.tag_0"),
6221            "{plan_str}"
6222        );
6223        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6224    }
6225
6226    #[tokio::test]
6227    async fn on_label_set_must_cover_both_sides_to_use_tsid_binary_join() {
6228        let eval_stmt = build_eval_stmt("some_metric / on(tag_0) some_alt_metric");
6229
6230        let table_provider = build_test_table_provider_with_tsid_tag_fields(&[
6231            (
6232                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6233                2,
6234                1,
6235            ),
6236            (
6237                (
6238                    DEFAULT_SCHEMA_NAME.to_string(),
6239                    "some_alt_metric".to_string(),
6240                ),
6241                1,
6242                1,
6243            ),
6244        ])
6245        .await;
6246        let plan =
6247            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6248                .await
6249                .unwrap();
6250
6251        let plan_str = plan.display_indent_schema().to_string();
6252        assert!(!plan_str.contains("__tsid ="), "{plan_str}");
6253        assert!(
6254            plan_str.contains("some_metric.tag_0 = some_alt_metric.tag_0"),
6255            "{plan_str}"
6256        );
6257        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6258    }
6259
6260    #[tokio::test]
6261    async fn comparison_binary_join_uses_tsid_and_keeps_it_in_filtered_result() {
6262        let eval_stmt = build_eval_stmt("some_metric > some_alt_metric");
6263
6264        let table_provider = build_test_table_provider_with_tsid(
6265            &[
6266                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6267                (
6268                    DEFAULT_SCHEMA_NAME.to_string(),
6269                    "some_alt_metric".to_string(),
6270                ),
6271            ],
6272            2,
6273            1,
6274        )
6275        .await;
6276        let mut planner = PromPlanner {
6277            table_provider,
6278            ctx: PromPlannerContext::from_eval_stmt(&eval_stmt),
6279        };
6280        let plan = planner
6281            .prom_expr_to_plan(&eval_stmt.expr, &build_query_engine_state())
6282            .await
6283            .unwrap();
6284
6285        let plan_str = plan.display_indent_schema().to_string();
6286        assert!(
6287            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
6288            "{plan_str}"
6289        );
6290        assert!(
6291            plan.schema()
6292                .fields()
6293                .iter()
6294                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME),
6295            "{plan_str}"
6296        );
6297        assert!(planner.ctx.use_tsid, "{plan_str}");
6298    }
6299
6300    #[tokio::test]
6301    async fn comparison_bool_binary_join_uses_tsid_when_available() {
6302        let eval_stmt = build_eval_stmt("some_metric > bool some_alt_metric");
6303
6304        let table_provider = build_test_table_provider_with_tsid(
6305            &[
6306                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6307                (
6308                    DEFAULT_SCHEMA_NAME.to_string(),
6309                    "some_alt_metric".to_string(),
6310                ),
6311            ],
6312            2,
6313            1,
6314        )
6315        .await;
6316        let plan =
6317            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6318                .await
6319                .unwrap();
6320
6321        let plan_str = plan.display_indent_schema().to_string();
6322        assert!(
6323            plan_str.contains("some_metric.__tsid = some_alt_metric.__tsid"),
6324            "{plan_str}"
6325        );
6326        assert!(!plan_str.contains("tag_0 ="), "{plan_str}");
6327        assert!(!plan_str.contains("tag_1 ="), "{plan_str}");
6328    }
6329
6330    #[tokio::test]
6331    async fn scalar_count_count_range_keeps_full_window() {
6332        let plan_str = build_optimized_tsid_plan(
6333            "scalar(count(count(some_metric) by (tag_0)))",
6334            1,
6335            1,
6336            100_000,
6337            1,
6338        )
6339        .await;
6340        assert!(plan_str.contains("ScalarCalculate: tags=[]"));
6341        assert!(plan_str.contains("PromInstantManipulate: range=[0..100000000]"));
6342        assert!(!plan_str.contains("PromInstantManipulate: range=[99999000..99999000]"));
6343    }
6344
6345    #[tokio::test]
6346    async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() {
6347        let plan_str = build_optimized_tsid_plan(
6348            "sum(irate(some_metric[1h])) / scalar(count(count(some_metric) by (tag_0)))",
6349            2,
6350            1,
6351            10,
6352            300,
6353        )
6354        .await;
6355        assert!(plan_str.contains("Distinct:"), "{plan_str}");
6356    }
6357
6358    #[tokio::test]
6359    async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() {
6360        assert_nested_count_rewrite_applies(
6361            "count(count(some_metric) by (tag_0))",
6362            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]"
6363        )
6364        .await;
6365    }
6366
6367    #[tokio::test]
6368    async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() {
6369        assert_nested_count_rewrite_applies(
6370            "count(sum(some_metric) by (tag_0))",
6371            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]"
6372        )
6373        .await;
6374    }
6375
6376    #[tokio::test]
6377    async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() {
6378        for (query, expected_outer_agg) in [
6379            (
6380                "count(avg(some_metric) by (tag_0))",
6381                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(avg(some_metric.field_0))]]",
6382            ),
6383            (
6384                "count(min(some_metric) by (tag_0))",
6385                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(min(some_metric.field_0))]]",
6386            ),
6387            (
6388                "count(max(some_metric) by (tag_0))",
6389                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(max(some_metric.field_0))]]",
6390            ),
6391            (
6392                "count(stddev(some_metric) by (tag_0))",
6393                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(stddev_pop(some_metric.field_0))]]",
6394            ),
6395            (
6396                "count(stdvar(some_metric) by (tag_0))",
6397                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(var_pop(some_metric.field_0))]]",
6398            ),
6399        ] {
6400            assert_nested_count_rewrite_applies(query, expected_outer_agg).await;
6401        }
6402    }
6403
6404    #[tokio::test]
6405    async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() {
6406        let count_plan =
6407            build_optimized_tsid_plan("count(count(some_metric) by (tag_0))", 2, 1, 100_000, 1)
6408                .await;
6409        assert!(
6410            !count_plan.contains("some_metric.field_0 IS NOT NULL"),
6411            "{count_plan}"
6412        );
6413
6414        for query in [
6415            "count(sum(some_metric) by (tag_0))",
6416            "count(avg(some_metric) by (tag_0))",
6417            "count(min(some_metric) by (tag_0))",
6418            "count(max(some_metric) by (tag_0))",
6419            "count(stddev(some_metric) by (tag_0))",
6420            "count(stdvar(some_metric) by (tag_0))",
6421        ] {
6422            let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
6423            assert!(
6424                plan_str.contains("Filter: some_metric.field_0 IS NOT NULL"),
6425                "{query}: {plan_str}"
6426            );
6427        }
6428    }
6429
6430    #[tokio::test]
6431    async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() {
6432        assert_nested_count_rewrite_missing("count(group(some_metric) by (tag_0))", 2, 1).await;
6433        assert_nested_count_rewrite_missing(
6434            "count(sum(irate(some_metric[1h])) by (tag_0))",
6435            2,
6436            300,
6437        )
6438        .await;
6439    }
6440
6441    #[tokio::test]
6442    async fn physical_table_name_is_not_leaked_in_plan() {
6443        let prom_expr = parser::parse("some_metric").unwrap();
6444        let eval_stmt = EvalStmt {
6445            expr: prom_expr,
6446            start: UNIX_EPOCH,
6447            end: UNIX_EPOCH
6448                .checked_add(Duration::from_secs(100_000))
6449                .unwrap(),
6450            interval: Duration::from_secs(5),
6451            lookback_delta: Duration::from_secs(1),
6452        };
6453
6454        let table_provider = build_test_table_provider_with_tsid(
6455            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6456            1,
6457            1,
6458        )
6459        .await;
6460        let plan =
6461            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6462                .await
6463                .unwrap();
6464
6465        let plan_str = plan.display_indent_schema().to_string();
6466        assert!(plan_str.contains("TableScan: phy"), "{plan}");
6467        assert!(plan_str.contains("SubqueryAlias: some_metric"));
6468        assert!(plan_str.contains("Filter: phy.__table_id = UInt32(1024)"));
6469        assert!(!plan_str.contains("TableScan: some_metric"));
6470    }
6471
6472    #[tokio::test]
6473    async fn sum_without_does_not_group_by_tsid() {
6474        let prom_expr = parser::parse("sum without (tag_0) (some_metric)").unwrap();
6475        let eval_stmt = EvalStmt {
6476            expr: prom_expr,
6477            start: UNIX_EPOCH,
6478            end: UNIX_EPOCH
6479                .checked_add(Duration::from_secs(100_000))
6480                .unwrap(),
6481            interval: Duration::from_secs(5),
6482            lookback_delta: Duration::from_secs(1),
6483        };
6484
6485        let table_provider = build_test_table_provider_with_tsid(
6486            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6487            1,
6488            1,
6489        )
6490        .await;
6491        let plan =
6492            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6493                .await
6494                .unwrap();
6495
6496        let plan_str = plan.display_indent_schema().to_string();
6497        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
6498
6499        let aggr_line = plan_str
6500            .lines()
6501            .find(|line| line.contains("Aggregate: groupBy="))
6502            .unwrap();
6503        assert!(!aggr_line.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
6504    }
6505
6506    #[tokio::test]
6507    async fn topk_without_does_not_partition_by_tsid() {
6508        let prom_expr = parser::parse("topk without (tag_0) (1, some_metric)").unwrap();
6509        let eval_stmt = EvalStmt {
6510            expr: prom_expr,
6511            start: UNIX_EPOCH,
6512            end: UNIX_EPOCH
6513                .checked_add(Duration::from_secs(100_000))
6514                .unwrap(),
6515            interval: Duration::from_secs(5),
6516            lookback_delta: Duration::from_secs(1),
6517        };
6518
6519        let table_provider = build_test_table_provider_with_tsid(
6520            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6521            1,
6522            1,
6523        )
6524        .await;
6525        let plan =
6526            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6527                .await
6528                .unwrap();
6529
6530        let plan_str = plan.display_indent_schema().to_string();
6531        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
6532
6533        let window_line = plan_str
6534            .lines()
6535            .find(|line| line.contains("WindowAggr: windowExpr=[[row_number()"))
6536            .unwrap();
6537        let partition_by = window_line
6538            .split("PARTITION BY [")
6539            .nth(1)
6540            .and_then(|s| s.split("] ORDER BY").next())
6541            .unwrap();
6542        assert!(!partition_by.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
6543    }
6544
6545    #[tokio::test]
6546    async fn sum_by_does_not_group_by_tsid() {
6547        let prom_expr = parser::parse("sum by (__tsid) (some_metric)").unwrap();
6548        let eval_stmt = EvalStmt {
6549            expr: prom_expr,
6550            start: UNIX_EPOCH,
6551            end: UNIX_EPOCH
6552                .checked_add(Duration::from_secs(100_000))
6553                .unwrap(),
6554            interval: Duration::from_secs(5),
6555            lookback_delta: Duration::from_secs(1),
6556        };
6557
6558        let table_provider = build_test_table_provider_with_tsid(
6559            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6560            1,
6561            1,
6562        )
6563        .await;
6564        let plan =
6565            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6566                .await
6567                .unwrap();
6568
6569        let plan_str = plan.display_indent_schema().to_string();
6570        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
6571
6572        let aggr_line = plan_str
6573            .lines()
6574            .find(|line| line.contains("Aggregate: groupBy="))
6575            .unwrap();
6576        assert!(!aggr_line.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
6577    }
6578
6579    #[tokio::test]
6580    async fn topk_by_does_not_partition_by_tsid() {
6581        let prom_expr = parser::parse("topk by (__tsid) (1, some_metric)").unwrap();
6582        let eval_stmt = EvalStmt {
6583            expr: prom_expr,
6584            start: UNIX_EPOCH,
6585            end: UNIX_EPOCH
6586                .checked_add(Duration::from_secs(100_000))
6587                .unwrap(),
6588            interval: Duration::from_secs(5),
6589            lookback_delta: Duration::from_secs(1),
6590        };
6591
6592        let table_provider = build_test_table_provider_with_tsid(
6593            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6594            1,
6595            1,
6596        )
6597        .await;
6598        let plan =
6599            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6600                .await
6601                .unwrap();
6602
6603        let plan_str = plan.display_indent_schema().to_string();
6604        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
6605
6606        let window_line = plan_str
6607            .lines()
6608            .find(|line| line.contains("WindowAggr: windowExpr=[[row_number()"))
6609            .unwrap();
6610        let partition_by = window_line
6611            .split("PARTITION BY [")
6612            .nth(1)
6613            .and_then(|s| s.split("] ORDER BY").next())
6614            .unwrap();
6615        assert!(!partition_by.contains(DATA_SCHEMA_TSID_COLUMN_NAME));
6616    }
6617
6618    #[tokio::test]
6619    async fn selector_matcher_on_tsid_does_not_use_internal_column() {
6620        let prom_expr = parser::parse(r#"some_metric{__tsid="123"}"#).unwrap();
6621        let eval_stmt = EvalStmt {
6622            expr: prom_expr,
6623            start: UNIX_EPOCH,
6624            end: UNIX_EPOCH
6625                .checked_add(Duration::from_secs(100_000))
6626                .unwrap(),
6627            interval: Duration::from_secs(5),
6628            lookback_delta: Duration::from_secs(1),
6629        };
6630
6631        let table_provider = build_test_table_provider_with_tsid(
6632            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6633            1,
6634            1,
6635        )
6636        .await;
6637        let plan =
6638            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6639                .await
6640                .unwrap();
6641
6642        fn collect_filter_cols(plan: &LogicalPlan, out: &mut HashSet<Column>) {
6643            if let LogicalPlan::Filter(filter) = plan {
6644                datafusion_expr::utils::expr_to_columns(&filter.predicate, out).unwrap();
6645            }
6646            for input in plan.inputs() {
6647                collect_filter_cols(input, out);
6648            }
6649        }
6650
6651        let mut filter_cols = HashSet::new();
6652        collect_filter_cols(&plan, &mut filter_cols);
6653        assert!(
6654            !filter_cols
6655                .iter()
6656                .any(|c| c.name == DATA_SCHEMA_TSID_COLUMN_NAME)
6657        );
6658    }
6659
6660    #[tokio::test]
6661    async fn tsid_is_not_used_when_physical_table_is_missing() {
6662        let prom_expr = parser::parse("some_metric").unwrap();
6663        let eval_stmt = EvalStmt {
6664            expr: prom_expr,
6665            start: UNIX_EPOCH,
6666            end: UNIX_EPOCH
6667                .checked_add(Duration::from_secs(100_000))
6668                .unwrap(),
6669            interval: Duration::from_secs(5),
6670            lookback_delta: Duration::from_secs(1),
6671        };
6672
6673        let catalog_list = MemoryCatalogManager::with_default_setup();
6674
6675        // Register a metric engine logical table referencing a missing physical table.
6676        let mut columns = vec![ColumnSchema::new(
6677            "tag_0".to_string(),
6678            ConcreteDataType::string_datatype(),
6679            false,
6680        )];
6681        columns.push(
6682            ColumnSchema::new(
6683                "timestamp".to_string(),
6684                ConcreteDataType::timestamp_millisecond_datatype(),
6685                false,
6686            )
6687            .with_time_index(true),
6688        );
6689        columns.push(ColumnSchema::new(
6690            "field_0".to_string(),
6691            ConcreteDataType::float64_datatype(),
6692            true,
6693        ));
6694        let schema = Arc::new(Schema::new(columns));
6695        let mut options = table::requests::TableOptions::default();
6696        options
6697            .extra_options
6698            .insert(LOGICAL_TABLE_METADATA_KEY.to_string(), "phy".to_string());
6699        let table_meta = TableMetaBuilder::empty()
6700            .schema(schema)
6701            .primary_key_indices(vec![0])
6702            .value_indices(vec![2])
6703            .engine(METRIC_ENGINE_NAME.to_string())
6704            .options(options)
6705            .next_column_id(1024)
6706            .build()
6707            .unwrap();
6708        let table_info = TableInfoBuilder::default()
6709            .table_id(1024)
6710            .name("some_metric")
6711            .meta(table_meta)
6712            .build()
6713            .unwrap();
6714        let table = EmptyTable::from_table_info(&table_info);
6715        catalog_list
6716            .register_table_sync(RegisterTableRequest {
6717                catalog: DEFAULT_CATALOG_NAME.to_string(),
6718                schema: DEFAULT_SCHEMA_NAME.to_string(),
6719                table_name: "some_metric".to_string(),
6720                table_id: 1024,
6721                table,
6722            })
6723            .unwrap();
6724
6725        let table_provider = DfTableSourceProvider::new(
6726            catalog_list,
6727            false,
6728            QueryContext::arc(),
6729            DummyDecoder::arc(),
6730            false,
6731        );
6732
6733        let plan =
6734            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6735                .await
6736                .unwrap();
6737
6738        let plan_str = plan.display_indent_schema().to_string();
6739        assert!(plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"));
6740        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
6741    }
6742
6743    #[tokio::test]
6744    async fn tsid_is_carried_only_when_aggregate_preserves_label_set() {
6745        let prom_expr = parser::parse("sum by (tag_0) (some_metric)").unwrap();
6746        let eval_stmt = EvalStmt {
6747            expr: prom_expr,
6748            start: UNIX_EPOCH,
6749            end: UNIX_EPOCH
6750                .checked_add(Duration::from_secs(100_000))
6751                .unwrap(),
6752            interval: Duration::from_secs(5),
6753            lookback_delta: Duration::from_secs(1),
6754        };
6755
6756        let table_provider = build_test_table_provider_with_tsid(
6757            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6758            1,
6759            1,
6760        )
6761        .await;
6762        let plan =
6763            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6764                .await
6765                .unwrap();
6766
6767        let plan_str = plan.display_indent_schema().to_string();
6768        assert!(plan_str.contains("first_value") && plan_str.contains("__tsid"));
6769        assert!(
6770            !plan
6771                .schema()
6772                .fields()
6773                .iter()
6774                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
6775        );
6776
6777        // Merging aggregate: label set is reduced, tsid should not be carried.
6778        let prom_expr = parser::parse("sum(some_metric)").unwrap();
6779        let eval_stmt = EvalStmt {
6780            expr: prom_expr,
6781            start: UNIX_EPOCH,
6782            end: UNIX_EPOCH
6783                .checked_add(Duration::from_secs(100_000))
6784                .unwrap(),
6785            interval: Duration::from_secs(5),
6786            lookback_delta: Duration::from_secs(1),
6787        };
6788        let table_provider = build_test_table_provider_with_tsid(
6789            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6790            1,
6791            1,
6792        )
6793        .await;
6794        let plan =
6795            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6796                .await
6797                .unwrap();
6798        let plan_str = plan.display_indent_schema().to_string();
6799        assert!(!plan_str.contains("first_value"));
6800    }
6801
6802    #[tokio::test]
6803    async fn or_operator_with_unknown_metric_does_not_require_tsid() {
6804        let prom_expr = parser::parse("unknown_metric or some_metric").unwrap();
6805        let eval_stmt = EvalStmt {
6806            expr: prom_expr,
6807            start: UNIX_EPOCH,
6808            end: UNIX_EPOCH
6809                .checked_add(Duration::from_secs(100_000))
6810                .unwrap(),
6811            interval: Duration::from_secs(5),
6812            lookback_delta: Duration::from_secs(1),
6813        };
6814
6815        let table_provider = build_test_table_provider_with_tsid(
6816            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6817            1,
6818            1,
6819        )
6820        .await;
6821
6822        let plan =
6823            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6824                .await
6825                .unwrap();
6826
6827        assert!(
6828            !plan
6829                .schema()
6830                .fields()
6831                .iter()
6832                .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
6833        );
6834    }
6835
6836    #[tokio::test]
6837    async fn aggregate_avg() {
6838        do_aggregate_expr_plan("avg", "avg").await;
6839    }
6840
6841    #[tokio::test]
6842    #[should_panic] // output type doesn't match
6843    async fn aggregate_count() {
6844        do_aggregate_expr_plan("count", "count").await;
6845    }
6846
6847    #[tokio::test]
6848    async fn aggregate_min() {
6849        do_aggregate_expr_plan("min", "min").await;
6850    }
6851
6852    #[tokio::test]
6853    async fn aggregate_max() {
6854        do_aggregate_expr_plan("max", "max").await;
6855    }
6856
6857    #[tokio::test]
6858    async fn aggregate_group() {
6859        // Regression test for `group()` aggregator.
6860        // PromQL: sum(group by (cluster)(kubernetes_build_info{service="kubernetes",job="apiserver"}))
6861        // should be plannable, and `group()` should produce constant 1 for each group.
6862        let prom_expr = parser::parse(
6863            "sum(group by (cluster)(kubernetes_build_info{service=\"kubernetes\",job=\"apiserver\"}))",
6864        )
6865        .unwrap();
6866        let eval_stmt = EvalStmt {
6867            expr: prom_expr,
6868            start: UNIX_EPOCH,
6869            end: UNIX_EPOCH
6870                .checked_add(Duration::from_secs(100_000))
6871                .unwrap(),
6872            interval: Duration::from_secs(5),
6873            lookback_delta: Duration::from_secs(1),
6874        };
6875
6876        let table_provider = build_test_table_provider_with_fields(
6877            &[(
6878                DEFAULT_SCHEMA_NAME.to_string(),
6879                "kubernetes_build_info".to_string(),
6880            )],
6881            &["cluster", "service", "job"],
6882        )
6883        .await;
6884        let plan =
6885            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6886                .await
6887                .unwrap();
6888
6889        let plan_str = plan.display_indent_schema().to_string();
6890        assert!(plan_str.contains("max(Float64(1"));
6891    }
6892
6893    #[tokio::test]
6894    async fn aggregate_stddev() {
6895        do_aggregate_expr_plan("stddev", "stddev_pop").await;
6896    }
6897
6898    #[tokio::test]
6899    async fn aggregate_stdvar() {
6900        do_aggregate_expr_plan("stdvar", "var_pop").await;
6901    }
6902
6903    // TODO(ruihang): add range fn tests once exprs are ready.
6904
6905    // {
6906    //     input: "some_metric{tag_0="foo"} + some_metric{tag_0="bar"}",
6907    //     expected: &BinaryExpr{
6908    //         Op: ADD,
6909    //         LHS: &VectorSelector{
6910    //             Name: "a",
6911    //             LabelMatchers: []*labels.Matcher{
6912    //                     MustLabelMatcher(labels.MatchEqual, "tag_0", "foo"),
6913    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
6914    //             },
6915    //         },
6916    //         RHS: &VectorSelector{
6917    //             Name: "sum",
6918    //             LabelMatchers: []*labels.Matcher{
    //                     MustLabelMatcher(labels.MatchEqual, "tag_0", "bar"),
6920    //                     MustLabelMatcher(labels.MatchEqual, model.MetricNameLabel, "some_metric"),
6921    //             },
6922    //         },
6923    //         VectorMatching: &VectorMatching{},
6924    //     },
6925    // },
6926    #[tokio::test]
6927    async fn binary_op_column_column() {
6928        let prom_expr =
6929            parser::parse(r#"some_metric{tag_0="foo"} + some_metric{tag_0="bar"}"#).unwrap();
6930        let eval_stmt = EvalStmt {
6931            expr: prom_expr,
6932            start: UNIX_EPOCH,
6933            end: UNIX_EPOCH
6934                .checked_add(Duration::from_secs(100_000))
6935                .unwrap(),
6936            interval: Duration::from_secs(5),
6937            lookback_delta: Duration::from_secs(1),
6938        };
6939
6940        let table_provider = build_test_table_provider(
6941            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
6942            1,
6943            1,
6944        )
6945        .await;
6946        let plan =
6947            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6948                .await
6949                .unwrap();
6950
6951        let expected = String::from(
6952            "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
6953            \n  Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6954            \n    SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6955            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6956            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6957            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6958            \n            Filter: some_metric.tag_0 = Utf8(\"foo\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6959            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6960            \n    SubqueryAlias: rhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6961            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6962            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6963            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6964            \n            Filter: some_metric.tag_0 = Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
6965            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
6966        );
6967
6968        assert_eq!(plan.display_indent_schema().to_string(), expected);
6969    }
6970
6971    async fn indie_query_plan_compare<T: AsRef<str>>(query: &str, expected: T) {
6972        let prom_expr = parser::parse(query).unwrap();
6973        let eval_stmt = EvalStmt {
6974            expr: prom_expr,
6975            start: UNIX_EPOCH,
6976            end: UNIX_EPOCH
6977                .checked_add(Duration::from_secs(100_000))
6978                .unwrap(),
6979            interval: Duration::from_secs(5),
6980            lookback_delta: Duration::from_secs(1),
6981        };
6982
6983        let table_provider = build_test_table_provider(
6984            &[
6985                (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
6986                (
6987                    "greptime_private".to_string(),
6988                    "some_alt_metric".to_string(),
6989                ),
6990            ],
6991            1,
6992            1,
6993        )
6994        .await;
6995        let plan =
6996            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
6997                .await
6998                .unwrap();
6999
7000        assert_eq!(plan.display_indent_schema().to_string(), expected.as_ref());
7001    }
7002
7003    #[tokio::test]
7004    async fn binary_op_literal_column() {
7005        let query = r#"1 + some_metric{tag_0="bar"}"#;
7006        let expected = String::from(
7007            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
7008            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7009            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7010            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7011            \n        Filter: some_metric.tag_0 = Utf8(\"bar\") AND some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7012            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7013        );
7014
7015        indie_query_plan_compare(query, expected).await;
7016    }
7017
7018    #[tokio::test]
7019    async fn binary_op_literal_literal() {
7020        let query = r#"1 + 1"#;
7021        let expected = r#"EmptyMetric: range=[0..100000000], interval=[5000] [time:Timestamp(ms), value:Float64;N]
7022  TableScan: dummy [time:Timestamp(ms), value:Float64;N]"#;
7023        indie_query_plan_compare(query, expected).await;
7024    }
7025
7026    #[tokio::test]
7027    async fn simple_bool_grammar() {
7028        let query = "some_metric != bool 1.2345";
7029        let expected = String::from(
7030            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 != Float64(1.2345) AS Float64) AS field_0 != Float64(1.2345) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 != Float64(1.2345):Float64;N]\
7031            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7032            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7033            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7034            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7035            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7036        );
7037
7038        indie_query_plan_compare(query, expected).await;
7039    }
7040
7041    #[tokio::test]
7042    async fn bool_with_additional_arithmetic() {
7043        let query = "some_metric + (1 == bool 2)";
7044        let expected = String::from(
7045            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
7046            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7047            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7048            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7049            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7050            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7051        );
7052
7053        indie_query_plan_compare(query, expected).await;
7054    }
7055
7056    #[tokio::test]
7057    async fn simple_unary() {
7058        let query = "-some_metric";
7059        let expected = String::from(
7060            "Projection: some_metric.tag_0, some_metric.timestamp, (- some_metric.field_0) AS (- field_0) [tag_0:Utf8, timestamp:Timestamp(ms), (- field_0):Float64;N]\
7061            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7062            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7063            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7064            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7065            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7066        );
7067
7068        indie_query_plan_compare(query, expected).await;
7069    }
7070
7071    #[tokio::test]
7072    async fn increase_aggr() {
7073        let query = "increase(some_metric[5m])";
7074        let expected = String::from(
7075            "Filter: prom_increase(timestamp_range,field_0,timestamp,Int64(300000)) IS NOT NULL [timestamp:Timestamp(ms), prom_increase(timestamp_range,field_0,timestamp,Int64(300000)):Float64;N, tag_0:Utf8]\
7076            \n  Projection: some_metric.timestamp, prom_increase(timestamp_range, field_0, some_metric.timestamp, Int64(300000)) AS prom_increase(timestamp_range,field_0,timestamp,Int64(300000)), some_metric.tag_0 [timestamp:Timestamp(ms), prom_increase(timestamp_range,field_0,timestamp,Int64(300000)):Float64;N, tag_0:Utf8]\
7077            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[300000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
7078            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7079            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7080            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7081            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-299999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7082            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7083        );
7084
7085        indie_query_plan_compare(query, expected).await;
7086    }
7087
7088    #[tokio::test]
7089    async fn less_filter_on_value() {
7090        let query = "some_metric < 1.2345";
7091        let expected = String::from(
7092            "Filter: some_metric.field_0 < Float64(1.2345) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7093            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7094            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7095            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7096            \n        Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7097            \n          TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7098        );
7099
7100        indie_query_plan_compare(query, expected).await;
7101    }
7102
7103    #[tokio::test]
7104    async fn count_over_time() {
7105        let query = "count_over_time(some_metric[5m])";
7106        let expected = String::from(
7107            "Filter: prom_count_over_time(timestamp_range,field_0) IS NOT NULL [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
7108            \n  Projection: some_metric.timestamp, prom_count_over_time(timestamp_range, field_0) AS prom_count_over_time(timestamp_range,field_0), some_metric.tag_0 [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
7109            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[300000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
7110            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7111            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7112            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7113            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-299999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7114            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7115        );
7116
7117        indie_query_plan_compare(query, expected).await;
7118    }
7119
7120    /// The outer `PromRangeManipulate` from a subquery must be preceded by
7121    /// `Sort` + `PromSeriesDivide`.
7122    #[tokio::test]
7123    async fn count_over_time_subquery() {
7124        let query = "count_over_time(some_metric[10m:1m])";
7125        let expected = String::from(
7126            "Filter: prom_count_over_time(timestamp_range,field_0) IS NOT NULL [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
7127            \n  Projection: some_metric.timestamp, prom_count_over_time(timestamp_range, field_0) AS prom_count_over_time(timestamp_range,field_0), some_metric.tag_0 [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
7128            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[600000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
7129            \n      PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7130            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7131            \n          PromInstantManipulate: range=[-540000..100000000], lookback=[1000], interval=[60000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7132            \n            PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7133            \n              Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7134            \n                Filter: some_metric.timestamp >= TimestampMillisecond(-540999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7135            \n                  TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7136        );
7137        indie_query_plan_compare(query, expected).await;
7138    }
7139
7140    #[tokio::test]
7141    async fn test_hash_join() {
7142        let mut eval_stmt = EvalStmt {
7143            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7144            start: UNIX_EPOCH,
7145            end: UNIX_EPOCH
7146                .checked_add(Duration::from_secs(100_000))
7147                .unwrap(),
7148            interval: Duration::from_secs(5),
7149            lookback_delta: Duration::from_secs(1),
7150        };
7151
7152        let case = r#"http_server_requests_seconds_sum{uri="/accounts/login"} / ignoring(kubernetes_pod_name,kubernetes_namespace) http_server_requests_seconds_count{uri="/accounts/login"}"#;
7153
7154        let prom_expr = parser::parse(case).unwrap();
7155        eval_stmt.expr = prom_expr;
7156        let table_provider = build_test_table_provider_with_fields(
7157            &[
7158                (
7159                    DEFAULT_SCHEMA_NAME.to_string(),
7160                    "http_server_requests_seconds_sum".to_string(),
7161                ),
7162                (
7163                    DEFAULT_SCHEMA_NAME.to_string(),
7164                    "http_server_requests_seconds_count".to_string(),
7165                ),
7166            ],
7167            &["uri", "kubernetes_namespace", "kubernetes_pod_name"],
7168        )
7169        .await;
7170        // Should be ok
7171        let plan =
7172            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7173                .await
7174                .unwrap();
7175        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
7176            \n  Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
7177            \n    SubqueryAlias: http_server_requests_seconds_sum\
7178            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
7179            \n        PromSeriesDivide: tags=[\"uri\", \"kubernetes_namespace\", \"kubernetes_pod_name\"]\
7180            \n          Sort: http_server_requests_seconds_sum.uri ASC NULLS FIRST, http_server_requests_seconds_sum.kubernetes_namespace ASC NULLS FIRST, http_server_requests_seconds_sum.kubernetes_pod_name ASC NULLS FIRST, http_server_requests_seconds_sum.greptime_timestamp ASC NULLS FIRST\
7181            \n            Filter: http_server_requests_seconds_sum.uri = Utf8(\"/accounts/login\") AND http_server_requests_seconds_sum.greptime_timestamp >= TimestampMillisecond(-999, None) AND http_server_requests_seconds_sum.greptime_timestamp <= TimestampMillisecond(100000000, None)\
7182            \n              TableScan: http_server_requests_seconds_sum\
7183            \n    SubqueryAlias: http_server_requests_seconds_count\
7184            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
7185            \n        PromSeriesDivide: tags=[\"uri\", \"kubernetes_namespace\", \"kubernetes_pod_name\"]\
7186            \n          Sort: http_server_requests_seconds_count.uri ASC NULLS FIRST, http_server_requests_seconds_count.kubernetes_namespace ASC NULLS FIRST, http_server_requests_seconds_count.kubernetes_pod_name ASC NULLS FIRST, http_server_requests_seconds_count.greptime_timestamp ASC NULLS FIRST\
7187            \n            Filter: http_server_requests_seconds_count.uri = Utf8(\"/accounts/login\") AND http_server_requests_seconds_count.greptime_timestamp >= TimestampMillisecond(-999, None) AND http_server_requests_seconds_count.greptime_timestamp <= TimestampMillisecond(100000000, None)\
7188            \n              TableScan: http_server_requests_seconds_count";
7189        assert_eq!(plan.to_string(), expected);
7190    }
7191
7192    #[tokio::test]
7193    async fn test_nested_histogram_quantile() {
7194        let mut eval_stmt = EvalStmt {
7195            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7196            start: UNIX_EPOCH,
7197            end: UNIX_EPOCH
7198                .checked_add(Duration::from_secs(100_000))
7199                .unwrap(),
7200            interval: Duration::from_secs(5),
7201            lookback_delta: Duration::from_secs(1),
7202        };
7203
7204        let case = r#"label_replace(histogram_quantile(0.99, sum by(pod, le, path, code) (rate(greptime_servers_grpc_requests_elapsed_bucket{container="frontend"}[1m0s]))), "pod_new", "$1", "pod", "greptimedb-frontend-[0-9a-z]*-(.*)")"#;
7205
7206        let prom_expr = parser::parse(case).unwrap();
7207        eval_stmt.expr = prom_expr;
7208        let table_provider = build_test_table_provider_with_fields(
7209            &[(
7210                DEFAULT_SCHEMA_NAME.to_string(),
7211                "greptime_servers_grpc_requests_elapsed_bucket".to_string(),
7212            )],
7213            &["pod", "le", "path", "code", "container"],
7214        )
7215        .await;
7216        // Should be ok
7217        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7218            .await
7219            .unwrap();
7220    }
7221
7222    #[tokio::test]
7223    async fn test_histogram_quantile_binary_op() {
7224        let mut eval_stmt = EvalStmt {
7225            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7226            start: UNIX_EPOCH,
7227            end: UNIX_EPOCH
7228                .checked_add(Duration::from_secs(100_000))
7229                .unwrap(),
7230            interval: Duration::from_secs(5),
7231            lookback_delta: Duration::from_secs(1),
7232        };
7233
7234        // Arithmetic applied to a histogram_quantile() result. Regression for #8144:
7235        // HistogramFold used to drop the input column qualifiers, so the binary-op
7236        // projection failed to resolve the qualified tag column.
7237        let case = r#"histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) + 0"#;
7238
7239        let prom_expr = parser::parse(case).unwrap();
7240        eval_stmt.expr = prom_expr;
7241        let table_provider = build_test_table_provider_with_fields(
7242            &[(
7243                DEFAULT_SCHEMA_NAME.to_string(),
7244                "http_request_duration_seconds_bucket".to_string(),
7245            )],
7246            &["pod", "le"],
7247        )
7248        .await;
7249        // Should plan without a "No field named ..." error.
7250        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7251            .await
7252            .unwrap();
7253    }
7254
7255    #[tokio::test]
7256    async fn test_parse_and_operator() {
7257        let mut eval_stmt = EvalStmt {
7258            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7259            start: UNIX_EPOCH,
7260            end: UNIX_EPOCH
7261                .checked_add(Duration::from_secs(100_000))
7262                .unwrap(),
7263            interval: Duration::from_secs(5),
7264            lookback_delta: Duration::from_secs(1),
7265        };
7266
7267        let cases = [
7268            r#"count (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} ) and (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} )) / (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_capacity_bytes{namespace=~".+"} )) >= (80 / 100)) or vector (0)"#,
7269            r#"count (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} ) unless (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes{namespace=~".+"} )) / (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_capacity_bytes{namespace=~".+"} )) >= (80 / 100)) or vector (0)"#,
7270        ];
7271
7272        for case in cases {
7273            let prom_expr = parser::parse(case).unwrap();
7274            eval_stmt.expr = prom_expr;
7275            let table_provider = build_test_table_provider_with_fields(
7276                &[
7277                    (
7278                        DEFAULT_SCHEMA_NAME.to_string(),
7279                        "kubelet_volume_stats_used_bytes".to_string(),
7280                    ),
7281                    (
7282                        DEFAULT_SCHEMA_NAME.to_string(),
7283                        "kubelet_volume_stats_capacity_bytes".to_string(),
7284                    ),
7285                ],
7286                &["namespace", "persistentvolumeclaim"],
7287            )
7288            .await;
7289            // Should be ok
7290            let _ =
7291                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7292                    .await
7293                    .unwrap();
7294        }
7295    }
7296
7297    #[tokio::test]
7298    async fn test_nested_binary_op() {
7299        let mut eval_stmt = EvalStmt {
7300            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7301            start: UNIX_EPOCH,
7302            end: UNIX_EPOCH
7303                .checked_add(Duration::from_secs(100_000))
7304                .unwrap(),
7305            interval: Duration::from_secs(5),
7306            lookback_delta: Duration::from_secs(1),
7307        };
7308
7309        let case = r#"sum(rate(nginx_ingress_controller_requests{job=~".*"}[2m])) -
7310        (
7311            sum(rate(nginx_ingress_controller_requests{namespace=~".*"}[2m]))
7312            or
7313            vector(0)
7314        )"#;
7315
7316        let prom_expr = parser::parse(case).unwrap();
7317        eval_stmt.expr = prom_expr;
7318        let table_provider = build_test_table_provider_with_fields(
7319            &[(
7320                DEFAULT_SCHEMA_NAME.to_string(),
7321                "nginx_ingress_controller_requests".to_string(),
7322            )],
7323            &["namespace", "job"],
7324        )
7325        .await;
7326        // Should be ok
7327        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7328            .await
7329            .unwrap();
7330    }
7331
7332    #[tokio::test]
7333    async fn test_parse_or_operator() {
7334        let mut eval_stmt = EvalStmt {
7335            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7336            start: UNIX_EPOCH,
7337            end: UNIX_EPOCH
7338                .checked_add(Duration::from_secs(100_000))
7339                .unwrap(),
7340            interval: Duration::from_secs(5),
7341            lookback_delta: Duration::from_secs(1),
7342        };
7343
7344        let case = r#"
7345        sum(rate(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}[120s])) by (cluster_name,tenant_name) /
7346        (sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) * 100)
7347            or
7348        200 * sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) /
7349        sum(sysstat{tenant_name=~"tenant1",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)"#;
7350
7351        let table_provider = build_test_table_provider_with_fields(
7352            &[(DEFAULT_SCHEMA_NAME.to_string(), "sysstat".to_string())],
7353            &["tenant_name", "cluster_name"],
7354        )
7355        .await;
7356        eval_stmt.expr = parser::parse(case).unwrap();
7357        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7358            .await
7359            .unwrap();
7360
7361        let case = r#"sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
7362            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) +
7363            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
7364            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0
7365            or
7366            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
7367            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0
7368            or
7369            sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) /
7370            (sum(delta(sysstat{tenant_name=~"sys",cluster_name=~"cluster1"}[2m])/120) by (cluster_name,tenant_name) *1000) >= 0"#;
7371        let table_provider = build_test_table_provider_with_fields(
7372            &[(DEFAULT_SCHEMA_NAME.to_string(), "sysstat".to_string())],
7373            &["tenant_name", "cluster_name"],
7374        )
7375        .await;
7376        eval_stmt.expr = parser::parse(case).unwrap();
7377        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7378            .await
7379            .unwrap();
7380
7381        let case = r#"(sum(background_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name) +
7382            sum(foreground_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)) or
7383            (sum(background_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name)) or
7384            (sum(foreground_waitevent_cnt{tenant_name=~"sys",cluster_name=~"cluster1"}) by (cluster_name,tenant_name))"#;
7385        let table_provider = build_test_table_provider_with_fields(
7386            &[
7387                (
7388                    DEFAULT_SCHEMA_NAME.to_string(),
7389                    "background_waitevent_cnt".to_string(),
7390                ),
7391                (
7392                    DEFAULT_SCHEMA_NAME.to_string(),
7393                    "foreground_waitevent_cnt".to_string(),
7394                ),
7395            ],
7396            &["tenant_name", "cluster_name"],
7397        )
7398        .await;
7399        eval_stmt.expr = parser::parse(case).unwrap();
7400        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7401            .await
7402            .unwrap();
7403
7404        let case = r#"avg(node_load1{cluster_name=~"cluster1"}) by (cluster_name,host_name) or max(container_cpu_load_average_10s{cluster_name=~"cluster1"}) by (cluster_name,host_name) * 100 / max(container_spec_cpu_quota{cluster_name=~"cluster1"}) by (cluster_name,host_name)"#;
7405        let table_provider = build_test_table_provider_with_fields(
7406            &[
7407                (DEFAULT_SCHEMA_NAME.to_string(), "node_load1".to_string()),
7408                (
7409                    DEFAULT_SCHEMA_NAME.to_string(),
7410                    "container_cpu_load_average_10s".to_string(),
7411                ),
7412                (
7413                    DEFAULT_SCHEMA_NAME.to_string(),
7414                    "container_spec_cpu_quota".to_string(),
7415                ),
7416            ],
7417            &["cluster_name", "host_name"],
7418        )
7419        .await;
7420        eval_stmt.expr = parser::parse(case).unwrap();
7421        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7422            .await
7423            .unwrap();
7424    }
7425
7426    #[tokio::test]
7427    async fn value_matcher() {
7428        // template
7429        let mut eval_stmt = EvalStmt {
7430            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7431            start: UNIX_EPOCH,
7432            end: UNIX_EPOCH
7433                .checked_add(Duration::from_secs(100_000))
7434                .unwrap(),
7435            interval: Duration::from_secs(5),
7436            lookback_delta: Duration::from_secs(1),
7437        };
7438
7439        let cases = [
7440            // single equal matcher
7441            (
7442                r#"some_metric{__field__="field_1"}"#,
7443                vec![
7444                    "some_metric.field_1",
7445                    "some_metric.tag_0",
7446                    "some_metric.tag_1",
7447                    "some_metric.tag_2",
7448                    "some_metric.timestamp",
7449                ],
7450            ),
7451            // two equal matchers
7452            (
7453                r#"some_metric{__field__="field_1", __field__="field_0"}"#,
7454                vec![
7455                    "some_metric.field_0",
7456                    "some_metric.field_1",
7457                    "some_metric.tag_0",
7458                    "some_metric.tag_1",
7459                    "some_metric.tag_2",
7460                    "some_metric.timestamp",
7461                ],
7462            ),
7463            // single not_eq matcher
7464            (
7465                r#"some_metric{__field__!="field_1"}"#,
7466                vec![
7467                    "some_metric.field_0",
7468                    "some_metric.field_2",
7469                    "some_metric.tag_0",
7470                    "some_metric.tag_1",
7471                    "some_metric.tag_2",
7472                    "some_metric.timestamp",
7473                ],
7474            ),
7475            // two not_eq matchers
7476            (
7477                r#"some_metric{__field__!="field_1", __field__!="field_2"}"#,
7478                vec![
7479                    "some_metric.field_0",
7480                    "some_metric.tag_0",
7481                    "some_metric.tag_1",
7482                    "some_metric.tag_2",
7483                    "some_metric.timestamp",
7484                ],
7485            ),
7486            // equal and not_eq matchers (no conflict)
7487            (
7488                r#"some_metric{__field__="field_1", __field__!="field_0"}"#,
7489                vec![
7490                    "some_metric.field_1",
7491                    "some_metric.tag_0",
7492                    "some_metric.tag_1",
7493                    "some_metric.tag_2",
7494                    "some_metric.timestamp",
7495                ],
7496            ),
7497            // equal and not_eq matchers (conflict)
7498            (
7499                r#"some_metric{__field__="field_2", __field__!="field_2"}"#,
7500                vec![
7501                    "some_metric.tag_0",
7502                    "some_metric.tag_1",
7503                    "some_metric.tag_2",
7504                    "some_metric.timestamp",
7505                ],
7506            ),
7507            // single regex eq matcher
7508            (
7509                r#"some_metric{__field__=~"field_1|field_2"}"#,
7510                vec![
7511                    "some_metric.field_1",
7512                    "some_metric.field_2",
7513                    "some_metric.tag_0",
7514                    "some_metric.tag_1",
7515                    "some_metric.tag_2",
7516                    "some_metric.timestamp",
7517                ],
7518            ),
7519            // single regex not_eq matcher
7520            (
7521                r#"some_metric{__field__!~"field_1|field_2"}"#,
7522                vec![
7523                    "some_metric.field_0",
7524                    "some_metric.tag_0",
7525                    "some_metric.tag_1",
7526                    "some_metric.tag_2",
7527                    "some_metric.timestamp",
7528                ],
7529            ),
7530        ];
7531
7532        for case in cases {
7533            let prom_expr = parser::parse(case.0).unwrap();
7534            eval_stmt.expr = prom_expr;
7535            let table_provider = build_test_table_provider(
7536                &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
7537                3,
7538                3,
7539            )
7540            .await;
7541            let plan =
7542                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7543                    .await
7544                    .unwrap();
7545            let mut fields = plan.schema().field_names();
7546            let mut expected = case.1.into_iter().map(String::from).collect::<Vec<_>>();
7547            fields.sort();
7548            expected.sort();
7549            assert_eq!(fields, expected, "case: {:?}", case.0);
7550        }
7551
7552        let bad_cases = [
7553            r#"some_metric{__field__="nonexistent"}"#,
7554            r#"some_metric{__field__!="nonexistent"}"#,
7555        ];
7556
7557        for case in bad_cases {
7558            let prom_expr = parser::parse(case).unwrap();
7559            eval_stmt.expr = prom_expr;
7560            let table_provider = build_test_table_provider(
7561                &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
7562                3,
7563                3,
7564            )
7565            .await;
7566            let plan =
7567                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7568                    .await;
7569            assert!(plan.is_err(), "case: {:?}", case);
7570        }
7571    }
7572
7573    #[tokio::test]
7574    async fn custom_schema() {
7575        let query = "some_alt_metric{__schema__=\"greptime_private\"}";
7576        let expected = String::from(
7577            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7578            \n  PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7579            \n    Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7580            \n      Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7581            \n        TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7582        );
7583
7584        indie_query_plan_compare(query, expected).await;
7585
7586        let query = "some_alt_metric{__database__=\"greptime_private\"}";
7587        let expected = String::from(
7588            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7589            \n  PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7590            \n    Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7591            \n      Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7592            \n        TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7593        );
7594
7595        indie_query_plan_compare(query, expected).await;
7596
7597        let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
7598        let expected = String::from(
7599            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
7600            \n  Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7601            \n    SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7602            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7603            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7604            \n          Sort: greptime_private.some_alt_metric.tag_0 ASC NULLS FIRST, greptime_private.some_alt_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7605            \n            Filter: greptime_private.some_alt_metric.timestamp >= TimestampMillisecond(-999, None) AND greptime_private.some_alt_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7606            \n              TableScan: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7607            \n    SubqueryAlias: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7608            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7609            \n        PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7610            \n          Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7611            \n            Filter: some_metric.timestamp >= TimestampMillisecond(-999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
7612            \n              TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
7613        );
7614
7615        indie_query_plan_compare(query, expected).await;
7616    }
7617
7618    #[tokio::test]
7619    async fn only_equals_is_supported_for_special_matcher() {
7620        let queries = &[
7621            "some_alt_metric{__schema__!=\"greptime_private\"}",
7622            "some_alt_metric{__schema__=~\"lalala\"}",
7623            "some_alt_metric{__database__!=\"greptime_private\"}",
7624            "some_alt_metric{__database__=~\"lalala\"}",
7625        ];
7626
7627        for query in queries {
7628            let prom_expr = parser::parse(query).unwrap();
7629            let eval_stmt = EvalStmt {
7630                expr: prom_expr,
7631                start: UNIX_EPOCH,
7632                end: UNIX_EPOCH
7633                    .checked_add(Duration::from_secs(100_000))
7634                    .unwrap(),
7635                interval: Duration::from_secs(5),
7636                lookback_delta: Duration::from_secs(1),
7637            };
7638
7639            let table_provider = build_test_table_provider(
7640                &[
7641                    (DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string()),
7642                    (
7643                        "greptime_private".to_string(),
7644                        "some_alt_metric".to_string(),
7645                    ),
7646                ],
7647                1,
7648                1,
7649            )
7650            .await;
7651
7652            let plan =
7653                PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7654                    .await;
7655            assert!(plan.is_err(), "query: {:?}", query);
7656        }
7657    }
7658
7659    #[tokio::test]
7660    async fn test_non_ms_precision() {
7661        let catalog_list = MemoryCatalogManager::with_default_setup();
7662        let columns = vec![
7663            ColumnSchema::new(
7664                "tag".to_string(),
7665                ConcreteDataType::string_datatype(),
7666                false,
7667            ),
7668            ColumnSchema::new(
7669                "timestamp".to_string(),
7670                ConcreteDataType::timestamp_nanosecond_datatype(),
7671                false,
7672            )
7673            .with_time_index(true),
7674            ColumnSchema::new(
7675                "field".to_string(),
7676                ConcreteDataType::float64_datatype(),
7677                true,
7678            ),
7679        ];
7680        let schema = Arc::new(Schema::new(columns));
7681        let table_meta = TableMetaBuilder::empty()
7682            .schema(schema)
7683            .primary_key_indices(vec![0])
7684            .value_indices(vec![2])
7685            .next_column_id(1024)
7686            .build()
7687            .unwrap();
7688        let table_info = TableInfoBuilder::default()
7689            .name("metrics".to_string())
7690            .meta(table_meta)
7691            .build()
7692            .unwrap();
7693        let table = EmptyTable::from_table_info(&table_info);
7694        assert!(
7695            catalog_list
7696                .register_table_sync(RegisterTableRequest {
7697                    catalog: DEFAULT_CATALOG_NAME.to_string(),
7698                    schema: DEFAULT_SCHEMA_NAME.to_string(),
7699                    table_name: "metrics".to_string(),
7700                    table_id: 1024,
7701                    table,
7702                })
7703                .is_ok()
7704        );
7705
7706        let plan = PromPlanner::stmt_to_plan(
7707            DfTableSourceProvider::new(
7708                catalog_list.clone(),
7709                false,
7710                QueryContext::arc(),
7711                DummyDecoder::arc(),
7712                true,
7713            ),
7714            &EvalStmt {
7715                expr: parser::parse("metrics{tag = \"1\"}").unwrap(),
7716                start: UNIX_EPOCH,
7717                end: UNIX_EPOCH
7718                    .checked_add(Duration::from_secs(100_000))
7719                    .unwrap(),
7720                interval: Duration::from_secs(5),
7721                lookback_delta: Duration::from_secs(1),
7722            },
7723            &build_query_engine_state(),
7724        )
7725        .await
7726        .unwrap();
7727        assert_eq!(
7728            plan.display_indent_schema().to_string(),
7729            "PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7730            \n  PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7731            \n    Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7732            \n      Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7733            \n        Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7734            \n          TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
7735        );
7736        let plan = PromPlanner::stmt_to_plan(
7737            DfTableSourceProvider::new(
7738                catalog_list.clone(),
7739                false,
7740                QueryContext::arc(),
7741                DummyDecoder::arc(),
7742                true,
7743            ),
7744            &EvalStmt {
7745                expr: parser::parse("avg_over_time(metrics{tag = \"1\"}[5s])").unwrap(),
7746                start: UNIX_EPOCH,
7747                end: UNIX_EPOCH
7748                    .checked_add(Duration::from_secs(100_000))
7749                    .unwrap(),
7750                interval: Duration::from_secs(5),
7751                lookback_delta: Duration::from_secs(1),
7752            },
7753            &build_query_engine_state(),
7754        )
7755        .await
7756        .unwrap();
7757        assert_eq!(
7758            plan.display_indent_schema().to_string(),
7759            "Filter: prom_avg_over_time(timestamp_range,field) IS NOT NULL [timestamp:Timestamp(ms), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
7760            \n  Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(ms), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
7761            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(ms), timestamp_range:Dictionary(Int64, Timestamp(ms))]\
7762            \n      PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7763            \n        PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7764            \n          Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7765            \n            Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-4999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7766            \n              Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
7767            \n                TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
7768        );
7769    }
7770
7771    #[tokio::test]
7772    async fn test_nonexistent_label() {
7773        // template
7774        let mut eval_stmt = EvalStmt {
7775            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7776            start: UNIX_EPOCH,
7777            end: UNIX_EPOCH
7778                .checked_add(Duration::from_secs(100_000))
7779                .unwrap(),
7780            interval: Duration::from_secs(5),
7781            lookback_delta: Duration::from_secs(1),
7782        };
7783
7784        let case = r#"some_metric{nonexistent="hi"}"#;
7785        let prom_expr = parser::parse(case).unwrap();
7786        eval_stmt.expr = prom_expr;
7787        let table_provider = build_test_table_provider(
7788            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
7789            3,
7790            3,
7791        )
7792        .await;
7793        // Should be ok
7794        let _ = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7795            .await
7796            .unwrap();
7797    }
7798
7799    #[tokio::test]
7800    async fn test_label_join() {
7801        let prom_expr = parser::parse(
7802            "label_join(up{tag_0='api-server'}, 'foo', ',', 'tag_1', 'tag_2', 'tag_3')",
7803        )
7804        .unwrap();
7805        let eval_stmt = EvalStmt {
7806            expr: prom_expr,
7807            start: UNIX_EPOCH,
7808            end: UNIX_EPOCH
7809                .checked_add(Duration::from_secs(100_000))
7810                .unwrap(),
7811            interval: Duration::from_secs(5),
7812            lookback_delta: Duration::from_secs(1),
7813        };
7814
7815        let table_provider =
7816            build_test_table_provider(&[(DEFAULT_SCHEMA_NAME.to_string(), "up".to_string())], 4, 1)
7817                .await;
7818        let plan =
7819            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7820                .await
7821                .unwrap();
7822
7823        let expected = r#"
7824Filter: up.field_0 IS NOT NULL [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8]
7825  Projection: up.timestamp, up.field_0, concat_ws(Utf8(","), up.tag_1, up.tag_2, up.tag_3) AS foo, up.tag_0, up.tag_1, up.tag_2, up.tag_3 [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8]
7826    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7827      PromSeriesDivide: tags=["tag_0", "tag_1", "tag_2", "tag_3"] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7828        Sort: up.tag_0 ASC NULLS FIRST, up.tag_1 ASC NULLS FIRST, up.tag_2 ASC NULLS FIRST, up.tag_3 ASC NULLS FIRST, up.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7829          Filter: up.tag_0 = Utf8("api-server") AND up.timestamp >= TimestampMillisecond(-999, None) AND up.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7830            TableScan: up [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"#;
7831
7832        let ret = plan.display_indent_schema().to_string();
7833        assert_eq!(format!("\n{ret}"), expected, "\n{}", ret);
7834    }
7835
7836    #[tokio::test]
7837    async fn test_label_replace() {
7838        let prom_expr = parser::parse(
7839            "label_replace(up{tag_0=\"a:c\"}, \"foo\", \"$1\", \"tag_0\", \"(.*):.*\")",
7840        )
7841        .unwrap();
7842        let eval_stmt = EvalStmt {
7843            expr: prom_expr,
7844            start: UNIX_EPOCH,
7845            end: UNIX_EPOCH
7846                .checked_add(Duration::from_secs(100_000))
7847                .unwrap(),
7848            interval: Duration::from_secs(5),
7849            lookback_delta: Duration::from_secs(1),
7850        };
7851
7852        let table_provider =
7853            build_test_table_provider(&[(DEFAULT_SCHEMA_NAME.to_string(), "up".to_string())], 1, 1)
7854                .await;
7855        let plan =
7856            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7857                .await
7858                .unwrap();
7859
7860        let expected = r#"
7861Filter: up.field_0 IS NOT NULL [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8]
7862  Projection: up.timestamp, up.field_0, regexp_replace(up.tag_0, Utf8("^(?s:(.*):.*)$"), Utf8("$1")) AS foo, up.tag_0 [timestamp:Timestamp(ms), field_0:Float64;N, foo:Utf8;N, tag_0:Utf8]
7863    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7864      PromSeriesDivide: tags=["tag_0"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7865        Sort: up.tag_0 ASC NULLS FIRST, up.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7866          Filter: up.tag_0 = Utf8("a:c") AND up.timestamp >= TimestampMillisecond(-999, None) AND up.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]
7867            TableScan: up [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]"#;
7868
7869        let ret = plan.display_indent_schema().to_string();
7870        assert_eq!(format!("\n{ret}"), expected, "\n{}", ret);
7871    }
7872
7873    #[tokio::test]
7874    async fn test_matchers_to_expr() {
7875        let mut eval_stmt = EvalStmt {
7876            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7877            start: UNIX_EPOCH,
7878            end: UNIX_EPOCH
7879                .checked_add(Duration::from_secs(100_000))
7880                .unwrap(),
7881            interval: Duration::from_secs(5),
7882            lookback_delta: Duration::from_secs(1),
7883        };
7884        let case =
7885            r#"sum(prometheus_tsdb_head_series{tag_1=~"(10.0.160.237:8080|10.0.160.237:9090)"})"#;
7886
7887        let prom_expr = parser::parse(case).unwrap();
7888        eval_stmt.expr = prom_expr;
7889        let table_provider = build_test_table_provider(
7890            &[(
7891                DEFAULT_SCHEMA_NAME.to_string(),
7892                "prometheus_tsdb_head_series".to_string(),
7893            )],
7894            3,
7895            3,
7896        )
7897        .await;
7898        let plan =
7899            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7900                .await
7901                .unwrap();
7902        let expected = "Sort: prometheus_tsdb_head_series.timestamp ASC NULLS LAST [timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.field_0):Float64;N, sum(prometheus_tsdb_head_series.field_1):Float64;N, sum(prometheus_tsdb_head_series.field_2):Float64;N]\
7903        \n  Aggregate: groupBy=[[prometheus_tsdb_head_series.timestamp]], aggr=[[sum(prometheus_tsdb_head_series.field_0), sum(prometheus_tsdb_head_series.field_1), sum(prometheus_tsdb_head_series.field_2)]] [timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.field_0):Float64;N, sum(prometheus_tsdb_head_series.field_1):Float64;N, sum(prometheus_tsdb_head_series.field_2):Float64;N]\
7904        \n    PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
7905        \n      PromSeriesDivide: tags=[\"tag_0\", \"tag_1\", \"tag_2\"] [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
7906        \n        Sort: prometheus_tsdb_head_series.tag_0 ASC NULLS FIRST, prometheus_tsdb_head_series.tag_1 ASC NULLS FIRST, prometheus_tsdb_head_series.tag_2 ASC NULLS FIRST, prometheus_tsdb_head_series.timestamp ASC NULLS FIRST [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
7907        \n          Filter: prometheus_tsdb_head_series.tag_1 ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]\
7908        \n            TableScan: prometheus_tsdb_head_series [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N]";
7909        assert_eq!(plan.display_indent_schema().to_string(), expected);
7910    }
7911
7912    #[tokio::test]
7913    async fn test_topk_expr() {
7914        let mut eval_stmt = EvalStmt {
7915            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7916            start: UNIX_EPOCH,
7917            end: UNIX_EPOCH
7918                .checked_add(Duration::from_secs(100_000))
7919                .unwrap(),
7920            interval: Duration::from_secs(5),
7921            lookback_delta: Duration::from_secs(1),
7922        };
7923        let case = r#"topk(10, sum(prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip))"#;
7924
7925        let prom_expr = parser::parse(case).unwrap();
7926        eval_stmt.expr = prom_expr;
7927        let table_provider = build_test_table_provider_with_fields(
7928            &[
7929                (
7930                    DEFAULT_SCHEMA_NAME.to_string(),
7931                    "prometheus_tsdb_head_series".to_string(),
7932                ),
7933                (
7934                    DEFAULT_SCHEMA_NAME.to_string(),
7935                    "http_server_requests_seconds_count".to_string(),
7936                ),
7937            ],
7938            &["ip"],
7939        )
7940        .await;
7941
7942        let plan =
7943            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7944                .await
7945                .unwrap();
7946        let expected = "Projection: sum(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp [sum(prometheus_tsdb_head_series.greptime_value):Float64;N, ip:Utf8, greptime_timestamp:Timestamp(ms)]\
7947        \n  Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
7948        \n    Filter: row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Float64(10) [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
7949        \n      WindowAggr: windowExpr=[[row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N, row_number() PARTITION BY [prometheus_tsdb_head_series.greptime_timestamp] ORDER BY [sum(prometheus_tsdb_head_series.greptime_value) DESC NULLS FIRST, prometheus_tsdb_head_series.ip DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64]\
7950        \n        Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
7951        \n          Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
7952        \n            PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
7953        \n              PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
7954        \n                Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
7955        \n                  Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
7956        \n                    TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
7957
7958        assert_eq!(plan.display_indent_schema().to_string(), expected);
7959    }
7960
7961    #[tokio::test]
7962    async fn test_count_values_expr() {
7963        let mut eval_stmt = EvalStmt {
7964            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
7965            start: UNIX_EPOCH,
7966            end: UNIX_EPOCH
7967                .checked_add(Duration::from_secs(100_000))
7968                .unwrap(),
7969            interval: Duration::from_secs(5),
7970            lookback_delta: Duration::from_secs(1),
7971        };
7972        let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip)"#;
7973
7974        let prom_expr = parser::parse(case).unwrap();
7975        eval_stmt.expr = prom_expr;
7976        let table_provider = build_test_table_provider_with_fields(
7977            &[
7978                (
7979                    DEFAULT_SCHEMA_NAME.to_string(),
7980                    "prometheus_tsdb_head_series".to_string(),
7981                ),
7982                (
7983                    DEFAULT_SCHEMA_NAME.to_string(),
7984                    "http_server_requests_seconds_count".to_string(),
7985                ),
7986            ],
7987            &["ip"],
7988        )
7989        .await;
7990
7991        let plan =
7992            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
7993                .await
7994                .unwrap();
7995        let expected = "Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N]\
7996        \n  Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]\
7997        \n    Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]\
7998        \n      Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]\
7999        \n        PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8000        \n          PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8001        \n            Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8002        \n              Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8003        \n                TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
8004
8005        assert_eq!(plan.display_indent_schema().to_string(), expected);
8006    }
8007
8008    #[tokio::test]
8009    async fn test_value_alias() {
8010        let mut eval_stmt = EvalStmt {
8011            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
8012            start: UNIX_EPOCH,
8013            end: UNIX_EPOCH
8014                .checked_add(Duration::from_secs(100_000))
8015                .unwrap(),
8016            interval: Duration::from_secs(5),
8017            lookback_delta: Duration::from_secs(1),
8018        };
8019        let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip)"#;
8020
8021        let prom_expr = parser::parse(case).unwrap();
8022        eval_stmt.expr = prom_expr;
8023        eval_stmt = QueryLanguageParser::apply_alias_extension(eval_stmt, "my_series");
8024        let table_provider = build_test_table_provider_with_fields(
8025            &[
8026                (
8027                    DEFAULT_SCHEMA_NAME.to_string(),
8028                    "prometheus_tsdb_head_series".to_string(),
8029                ),
8030                (
8031                    DEFAULT_SCHEMA_NAME.to_string(),
8032                    "http_server_requests_seconds_count".to_string(),
8033                ),
8034            ],
8035            &["ip"],
8036        )
8037        .await;
8038
8039        let plan =
8040            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
8041                .await
8042                .unwrap();
8043        let expected = r#"
8044Projection: count(prometheus_tsdb_head_series.greptime_value) AS my_series, prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp [my_series:Int64, ip:Utf8, greptime_timestamp:Timestamp(ms)]
8045  Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N]
8046    Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]
8047      Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(ms), series:Float64;N, greptime_value:Float64;N]
8048        Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]
8049          PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8050            PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8051              Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8052                Filter: prometheus_tsdb_head_series.ip ~ Utf8("^(?:(10.0.160.237:8080|10.0.160.237:9090))$") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8053                  TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]"#;
8054        assert_eq!(format!("\n{}", plan.display_indent_schema()), expected);
8055    }
8056
8057    #[tokio::test]
8058    async fn test_quantile_expr() {
8059        let mut eval_stmt = EvalStmt {
8060            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
8061            start: UNIX_EPOCH,
8062            end: UNIX_EPOCH
8063                .checked_add(Duration::from_secs(100_000))
8064                .unwrap(),
8065            interval: Duration::from_secs(5),
8066            lookback_delta: Duration::from_secs(1),
8067        };
8068        let case = r#"quantile(0.3, sum(prometheus_tsdb_head_series{ip=~"(10.0.160.237:8080|10.0.160.237:9090)"}) by (ip))"#;
8069
8070        let prom_expr = parser::parse(case).unwrap();
8071        eval_stmt.expr = prom_expr;
8072        let table_provider = build_test_table_provider_with_fields(
8073            &[
8074                (
8075                    DEFAULT_SCHEMA_NAME.to_string(),
8076                    "prometheus_tsdb_head_series".to_string(),
8077                ),
8078                (
8079                    DEFAULT_SCHEMA_NAME.to_string(),
8080                    "http_server_requests_seconds_count".to_string(),
8081                ),
8082            ],
8083            &["ip"],
8084        )
8085        .await;
8086
8087        let plan =
8088            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
8089                .await
8090                .unwrap();
8091        let expected = "Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [greptime_timestamp:Timestamp(ms), quantile(Float64(0.3),sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]\
8092        \n  Aggregate: groupBy=[[prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[quantile(Float64(0.3), sum(prometheus_tsdb_head_series.greptime_value))]] [greptime_timestamp:Timestamp(ms), quantile(Float64(0.3),sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]\
8093        \n    Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
8094        \n      Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(ms), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]\
8095        \n        PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8096        \n          PromSeriesDivide: tags=[\"ip\"] [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8097        \n            Sort: prometheus_tsdb_head_series.ip ASC NULLS FIRST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS FIRST [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8098        \n              Filter: prometheus_tsdb_head_series.ip ~ Utf8(\"^(?:(10.0.160.237:8080|10.0.160.237:9090))$\") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-999, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100000000, None) [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]\
8099        \n                TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]";
8100
8101        assert_eq!(plan.display_indent_schema().to_string(), expected);
8102    }
8103
8104    #[tokio::test]
8105    async fn test_or_not_exists_table_label() {
8106        let mut eval_stmt = EvalStmt {
8107            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
8108            start: UNIX_EPOCH,
8109            end: UNIX_EPOCH
8110                .checked_add(Duration::from_secs(100_000))
8111                .unwrap(),
8112            interval: Duration::from_secs(5),
8113            lookback_delta: Duration::from_secs(1),
8114        };
8115        let case = r#"sum by (job, tag0, tag2) (metric_exists) or sum by (job, tag0, tag2) (metric_not_exists)"#;
8116
8117        let prom_expr = parser::parse(case).unwrap();
8118        eval_stmt.expr = prom_expr;
8119        let table_provider = build_test_table_provider_with_fields(
8120            &[(DEFAULT_SCHEMA_NAME.to_string(), "metric_exists".to_string())],
8121            &["job"],
8122        )
8123        .await;
8124
8125        let plan =
8126            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
8127                .await
8128                .unwrap();
8129        let expected = r#"UnionDistinctOn: on col=[["job"]], ts_col=[greptime_timestamp] [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
8130  SubqueryAlias: metric_exists [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
8131    Projection: metric_exists.greptime_timestamp, metric_exists.job, sum(metric_exists.greptime_value) [greptime_timestamp:Timestamp(ms), job:Utf8, sum(metric_exists.greptime_value):Float64;N]
8132      Sort: metric_exists.job ASC NULLS LAST, metric_exists.greptime_timestamp ASC NULLS LAST [job:Utf8, greptime_timestamp:Timestamp(ms), sum(metric_exists.greptime_value):Float64;N]
8133        Aggregate: groupBy=[[metric_exists.job, metric_exists.greptime_timestamp]], aggr=[[sum(metric_exists.greptime_value)]] [job:Utf8, greptime_timestamp:Timestamp(ms), sum(metric_exists.greptime_value):Float64;N]
8134          PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8135            PromSeriesDivide: tags=["job"] [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8136              Sort: metric_exists.job ASC NULLS FIRST, metric_exists.greptime_timestamp ASC NULLS FIRST [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8137                Filter: metric_exists.greptime_timestamp >= TimestampMillisecond(-999, None) AND metric_exists.greptime_timestamp <= TimestampMillisecond(100000000, None) [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8138                  TableScan: metric_exists [job:Utf8, greptime_timestamp:Timestamp(ms), greptime_value:Float64;N]
8139  SubqueryAlias:  [greptime_timestamp:Timestamp(ms), job:Utf8;N, sum(.value):Float64;N]
8140    Projection: .time AS greptime_timestamp, Utf8(NULL) AS job, sum(.value) [greptime_timestamp:Timestamp(ms), job:Utf8;N, sum(.value):Float64;N]
8141      Sort: .time ASC NULLS LAST [time:Timestamp(ms), sum(.value):Float64;N]
8142        Aggregate: groupBy=[[.time]], aggr=[[sum(.value)]] [time:Timestamp(ms), sum(.value):Float64;N]
8143          EmptyMetric: range=[0..-1], interval=[5000] [time:Timestamp(ms), value:Float64;N]
8144            TableScan: dummy [time:Timestamp(ms), value:Float64;N]"#;
8145
8146        assert_eq!(plan.display_indent_schema().to_string(), expected);
8147    }
8148
8149    #[tokio::test]
8150    async fn test_histogram_quantile_missing_le_column() {
8151        let mut eval_stmt = EvalStmt {
8152            expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
8153            start: UNIX_EPOCH,
8154            end: UNIX_EPOCH
8155                .checked_add(Duration::from_secs(100_000))
8156                .unwrap(),
8157            interval: Duration::from_secs(5),
8158            lookback_delta: Duration::from_secs(1),
8159        };
8160
8161        // Test case: histogram_quantile with a table that doesn't have 'le' column
8162        let case = r#"histogram_quantile(0.99, sum by(pod,instance,le) (rate(non_existent_histogram_bucket{instance=~"xxx"}[1m])))"#;
8163
8164        let prom_expr = parser::parse(case).unwrap();
8165        eval_stmt.expr = prom_expr;
8166
8167        // Create a table provider with a table that doesn't have 'le' column
8168        let table_provider = build_test_table_provider_with_fields(
8169            &[(
8170                DEFAULT_SCHEMA_NAME.to_string(),
8171                "non_existent_histogram_bucket".to_string(),
8172            )],
8173            &["pod", "instance"], // Note: no 'le' column
8174        )
8175        .await;
8176
8177        // Should return empty result instead of error
8178        let result =
8179            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
8180                .await;
8181
8182        // This should succeed now (returning empty result) instead of failing with "Cannot find column le"
8183        assert!(
8184            result.is_ok(),
8185            "Expected successful plan creation with empty result, but got error: {:?}",
8186            result.err()
8187        );
8188
8189        // Verify that the result is an EmptyRelation
8190        let plan = result.unwrap();
8191        match plan {
8192            LogicalPlan::EmptyRelation(_) => {
8193                // This is what we expect
8194            }
8195            _ => panic!("Expected EmptyRelation, but got: {:?}", plan),
8196        }
8197    }
8198}