From 07d05da95c7d3882a7032dade3cbeefbd96628b7 Mon Sep 17 00:00:00 2001 From: Toby Mao Date: Thu, 19 Dec 2024 09:37:43 -0800 Subject: [PATCH] fix!: normalize before qualifying tables (#4539) closes #4538 --- sqlglot/optimizer/qualify.py | 2 +- tests/fixtures/optimizer/optimizer.sql | 44 +++++++++++++------------- tests/test_optimizer.py | 29 +++++++++++++++++ 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/sqlglot/optimizer/qualify.py b/sqlglot/optimizer/qualify.py index 03ab6ba8f3..a35699d219 100644 --- a/sqlglot/optimizer/qualify.py +++ b/sqlglot/optimizer/qualify.py @@ -69,7 +69,6 @@ def qualify( The qualified expression. """ schema = ensure_schema(schema, dialect=dialect) - expression = normalize_identifiers(expression, dialect=dialect) expression = qualify_tables( expression, db=db, @@ -78,6 +77,7 @@ def qualify( dialect=dialect, infer_csv_schemas=infer_csv_schemas, ) + expression = normalize_identifiers(expression, dialect=dialect) if isolate_tables: expression = isolate_table_selects(expression, schema=schema) diff --git a/tests/fixtures/optimizer/optimizer.sql b/tests/fixtures/optimizer/optimizer.sql index 76fc16dd7d..ed9c06f5d3 100644 --- a/tests/fixtures/optimizer/optimizer.sql +++ b/tests/fixtures/optimizer/optimizer.sql @@ -630,11 +630,11 @@ PIVOT(SUM(`u_cte`.`f`) AS `sum` FOR `u_cte`.`h` IN ('x', 'y')) AS `_q_0`; # dialect: snowflake SELECT * FROM u PIVOT (SUM(f) FOR h IN ('x', 'y')); SELECT - "_q_0"."G" AS "G", - "_q_0"."'x'" AS "'x'", - "_q_0"."'y'" AS "'y'" + "_Q_0"."G" AS "G", + "_Q_0"."'x'" AS "'x'", + "_Q_0"."'y'" AS "'y'" FROM "U" AS "U" -PIVOT(SUM("U"."F") FOR "U"."H" IN ('x', 'y')) AS "_q_0"; +PIVOT(SUM("U"."F") FOR "U"."H" IN ('x', 'y')) AS "_Q_0"; # title: selecting all columns from a pivoted source and generating spark # note: spark doesn't allow pivot aliases or qualified columns for the pivot's "field" (`h`) @@ -690,14 +690,14 @@ PIVOT(MAX("SOURCE"."VALUE") FOR "SOURCE"."KEY" IN ('a', 'b', 'c')) AS "FINAL"("I # dialect: snowflake SELECT * FROM m_sales AS m_sales(empid, dept, jan, feb) UNPIVOT(sales FOR month IN (jan, feb)) ORDER BY empid; SELECT - "_q_0"."EMPID" AS "EMPID", - "_q_0"."DEPT" AS "DEPT", - "_q_0"."MONTH" AS "MONTH", - "_q_0"."SALES" AS "SALES" + "_Q_0"."EMPID" AS "EMPID", + "_Q_0"."DEPT" AS "DEPT", + "_Q_0"."MONTH" AS "MONTH", + "_Q_0"."SALES" AS "SALES" FROM "M_SALES" AS "M_SALES"("EMPID", "DEPT", "JAN", "FEB") -UNPIVOT("SALES" FOR "MONTH" IN ("JAN", "FEB")) AS "_q_0" +UNPIVOT("SALES" FOR "MONTH" IN ("JAN", "FEB")) AS "_Q_0" ORDER BY - "_q_0"."EMPID"; + "_Q_0"."EMPID"; # title: unpivoted table source, unpivot has column aliases # execute: false @@ -722,10 +722,10 @@ UNPIVOT("sales" FOR "month" IN ("m_sales"."jan", "m_sales"."feb")) AS "unpiv"("a # dialect: snowflake SELECT * FROM (SELECT * FROM m_sales) AS m_sales(empid, dept, jan, feb) UNPIVOT(sales FOR month IN (jan, feb)) ORDER BY empid; SELECT - "_q_0"."EMPID" AS "EMPID", - "_q_0"."DEPT" AS "DEPT", - "_q_0"."MONTH" AS "MONTH", - "_q_0"."SALES" AS "SALES" + "_Q_0"."EMPID" AS "EMPID", + "_Q_0"."DEPT" AS "DEPT", + "_Q_0"."MONTH" AS "MONTH", + "_Q_0"."SALES" AS "SALES" FROM ( SELECT "M_SALES"."EMPID" AS "EMPID", @@ -734,9 +734,9 @@ FROM ( "M_SALES"."FEB" AS "FEB" FROM "M_SALES" AS "M_SALES" ) AS "M_SALES" -UNPIVOT("SALES" FOR "MONTH" IN ("JAN", "FEB")) AS "_q_0" +UNPIVOT("SALES" FOR "MONTH" IN ("JAN", "FEB")) AS "_Q_0" ORDER BY - "_q_0"."EMPID"; + "_Q_0"."EMPID"; # title: unpivoted table source with a single value column, unpivot columns can be qualified # execute: false @@ -832,13 +832,13 @@ WHERE GROUP BY `dAy`, `top_term`, rank ORDER BY `DaY` DESC; SELECT - `TOp_TeRmS`.`refresh_date` AS `day`, - `TOp_TeRmS`.`term` AS `top_term`, - `TOp_TeRmS`.`rank` AS `rank` -FROM `bigquery-public-data.GooGle_tReNDs.TOp_TeRmS` AS `TOp_TeRmS` + `top_terms`.`refresh_date` AS `day`, + `top_terms`.`term` AS `top_term`, + `top_terms`.`rank` AS `rank` +FROM `bigquery-public-data.GooGle_tReNDs.TOp_TeRmS` AS `top_terms` WHERE - `TOp_TeRmS`.`rank` = 1 - AND `TOp_TeRmS`.`refresh_date` >= DATE_SUB(CURRENT_DATE, INTERVAL '2' WEEK) + `top_terms`.`rank` = 1 + AND `top_terms`.`refresh_date` >= DATE_SUB(CURRENT_DATE, INTERVAL '2' WEEK) GROUP BY `day`, `top_term`, diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 7f2ed0db51..fe4130e301 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -255,6 +255,35 @@ def test_normalize(self): @patch("sqlglot.generator.logger") def test_qualify_columns(self, logger): + self.assertEqual( + optimizer.qualify.qualify( + parse_one( + """ + SELECT Teams.Name, count(*) + FROM raw.TeamMemberships as TeamMemberships + join raw.Teams + on Teams.Id = TeamMemberships.TeamId + GROUP BY 1 + """, + read="bigquery", + ), + schema={ + "raw": { + "TeamMemberships": { + "Id": "INTEGER", + "UserId": "INTEGER", + "TeamId": "INTEGER", + }, + "Teams": { + "Id": "INTEGER", + "Name": "STRING", + }, + } + }, + dialect="bigquery", + ).sql(dialect="bigquery"), + "SELECT `teams`.`name` AS `name`, count(*) AS `_col_1` FROM `raw`.`TeamMemberships` AS `teammemberships` JOIN `raw`.`Teams` AS `teams` ON `teams`.`id` = `teammemberships`.`teamid` GROUP BY `teams`.`name`", + ) self.assertEqual( optimizer.qualify.qualify( parse_one(