CLD Pricing Outlier Bounds
Purpose​
This DAG creates outlier bounds for negotiated rate, list price, and cash price in the CLD data. The outlier bounds are used to identify and filter pricing anomalies during data processing.
DAG Variables​
{
"trino_conn_name": "trino_default",
"datestamp": "", // E.g. 2025_03_28, defaults to the `data_interval_end` date of the DAG run if not provided
"input_catalog": "tq_dev",
"input_schema": "internal_dev_csong_cld_v0_1",
"input_table": "prod_combined_all",
"output_catalog": "hive",
"output_schema": "cld_utils",
"output_table_negotiated_rate": "", // E.g. outlier_bounds_negotiated_rate_2025_03_28
"output_table_list_price": "", // E.g. outlier_bounds_list_price_2025_03_28
"output_table_cash_price": "", // E.g. outlier_bounds_cash_price_2025_03_28
"initial_rate_threshold": 100000000, // The threshold to initially filter rates to for excluding obvious outliers
"canonical_rate_score_threshold": 5 // Canonical Rate Score, see [CLD Scores](https://cld.turquoise.health/components/accuracy/scores) for details
}
SQL Queries​
create_outlier_bounds_negotiated_rate​
DROP TABLE IF EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }};
CREATE TABLE IF NOT EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }} AS
WITH
-- Provider-code pairs
provider_code_medicare_rates AS (
SELECT
provider_id,
cbsa,
CASE
WHEN provider_type LIKE '%Hospital%'
THEN provider_subtype
ELSE provider_type
END as provider_type,
bill_type,
facility,
is_drug_code,
billing_code,
billing_code_type,
any_value(state) as state,
-- depending on subversion, there may be +/- .0001 variance in cbsa avg medicare rate
AVG(cbsa_avg_medicare_rate) as cbsa_avg_medicare_rate
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE cbsa_avg_medicare_rate IS NOT NULL
GROUP BY 1,2,3,4,5,6,7,8
),
-- Compute cbsa average Medicare rate
cbsa_avg_medicare AS (
SELECT
CASE
WHEN cbsa = '99999'
THEN state
ELSE cbsa
END AS cbsa,
provider_type,
bill_type,
facility,
billing_code_type,
billing_code,
AVG(cbsa_avg_medicare_rate) as avg_medicare_rate
FROM provider_code_medicare_rates
GROUP BY 1,2,3,4,5,6
),
-- Pulls validated rates from previous run and add log transform
cld AS (
SELECT
*,
ln(canonical_rate) as log_canonical_rate
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE canonical_rate_score = {{ params.canonical_rate_score_threshold }}
AND canonical_rate > 0
),
-- Medicare reference ASP bounds
asp AS (
SELECT
hcpcs as code,
CAST(payment_limit AS DOUBLE) as asp_payment_limit
FROM tq_production.reference_internal.asp_reference_pricing
WHERE is_latest_start_effective_date = true
),
-- Calculate IQR for each billing code on the log scale
iqr AS (
SELECT
CASE
WHEN provider_type LIKE '%Hospital%'
THEN provider_subtype
ELSE provider_type
END as provider_type,
bill_type,
facility,
billing_code_type,
billing_code,
STDDEV(log_canonical_rate) as stddev,
SKEWNESS(log_canonical_rate) as skewness,
KURTOSIS(log_canonical_rate) as kurtosis,
APPROX_PERCENTILE(log_canonical_rate, 0.25) as q1,
APPROX_PERCENTILE(log_canonical_rate, 0.50) as median,
APPROX_PERCENTILE(log_canonical_rate, 0.75) as q3,
APPROX_PERCENTILE(log_canonical_rate, 0.85) as p85,
APPROX_PERCENTILE(log_canonical_rate, 0.90) as p90,
APPROX_PERCENTILE(log_canonical_rate, 0.95) as p95,
APPROX_PERCENTILE(log_canonical_rate, 0.99) as p99,
CAST(
APPROX_PERCENTILE(log_canonical_rate, 0.75)
- APPROX_PERCENTILE(log_canonical_rate, 0.25)
AS DECIMAL(8,5)
) as iqr,
count(distinct log_canonical_rate) as n
FROM cld
WHERE log_canonical_rate IS NOT NULL
GROUP BY 1, 2, 3, 4, 5
),
iqr_state AS (
SELECT
state,
CASE
WHEN provider_type LIKE '%Hospital%'
THEN provider_subtype
ELSE provider_type
END as provider_type,
bill_type,
facility,
billing_code_type,
billing_code,
STDDEV(log_canonical_rate) as stddev,
SKEWNESS(log_canonical_rate) as skewness,
KURTOSIS(log_canonical_rate) as kurtosis,
APPROX_PERCENTILE(log_canonical_rate, 0.50) as median,
count(distinct log_canonical_rate) as n
FROM cld
WHERE log_canonical_rate IS NOT NULL
GROUP BY 1, 2, 3, 4, 5, 6
),
-- Truncate IQR to 1
iqr_truncated AS (
SELECT
*,
CASE
WHEN iqr > 1 THEN 1
ELSE iqr
END as iqr_truncated
FROM iqr
),
-- Distinct counts from provider-codes data
distinct_counts AS (
SELECT
CASE
WHEN provider_type LIKE '%Hospital%'
THEN provider_subtype
ELSE provider_type
END as provider_type,
bill_type,
facility,
billing_code_type,
billing_code,
COUNT(DISTINCT payer_id) as n_distinct_payer,
COUNT(DISTINCT provider_id) as n_distinct_provider,
COUNT(DISTINCT cbsa) as n_distinct_cbsa
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE canonical_rate <= {{ params.initial_rate_threshold }}
GROUP BY 1, 2, 3, 4, 5
)
SELECT
pc.provider_id,
pc.provider_type,
pc.bill_type,
pc.facility,
pc.billing_code_type,
pc.billing_code,
pc.state as provider_state,
dc.n_distinct_payer,
dc.n_distinct_provider,
dc.n_distinct_cbsa,
asp.asp_payment_limit,
COALESCE(
pc.cbsa_avg_medicare_rate,
sam.avg_medicare_rate
) as avg_medicare_rate,
sam.avg_medicare_rate as cbsa_avg_medicare_rate,
exp(q1 - 2 * iqr_truncated) as iqr_lower_bound,
exp(q3 + 2 * iqr_truncated) as iqr_upper_bound,
CASE
WHEN iqr_state.n > 40 THEN iqr_state.stddev
ELSE iqr.stddev
END as stddev,
iqr.skewness,
iqr.kurtosis,
CASE
WHEN ABS(iqr.skewness) <= {{ params.skewness_threshold }}
AND iqr.kurtosis > {{ params.kurtosis_threshold }}
THEN True
ELSE False
END as use_validated_distribution_for_accuracy,
q1,
CASE
WHEN iqr_state.n > 40 THEN iqr_state.median
ELSE iqr.median
END as median,
q3,
p85,
p90,
p95,
p99,
iqr,
iqr_truncated,
CASE
WHEN pc.bill_type = 'Inpatient' AND sam.avg_medicare_rate IS NOT NULL
THEN sam.avg_medicare_rate * 0.9
WHEN is_drug_code = True
THEN COALESCE(asp.asp_payment_limit * 0.8, sam.avg_medicare_rate * 0.8)
WHEN COALESCE(iqr.n, 0) < 40
THEN COALESCE(pc.cbsa_avg_medicare_rate, sam.avg_medicare_rate) * 0.1
ELSE exp(q1 - 2 * iqr_truncated)
END as lower_bound,
CASE
WHEN is_drug_code = True
THEN COALESCE(asp.asp_payment_limit * 4, sam.avg_medicare_rate * 4)
WHEN COALESCE(iqr.n, 0) < 40
THEN COALESCE(pc.cbsa_avg_medicare_rate, sam.avg_medicare_rate) * 10
-- cap at 30x avg medicare
WHEN exp(q3 + 2 * iqr_truncated) > COALESCE(pc.cbsa_avg_medicare_rate, sam.avg_medicare_rate, 0) * 30
THEN COALESCE(pc.cbsa_avg_medicare_rate, sam.avg_medicare_rate, 0) * 30
ELSE exp(q3 + 2 * iqr_truncated)
END as upper_bound,
CASE
WHEN pc.bill_type = 'Inpatient' AND sam.avg_medicare_rate IS NOT NULL
THEN 'cbsa_avg_medicare_rate_90_pct'
WHEN is_drug_code = True
THEN 'asp'
WHEN COALESCE(iqr.n, 0) < 40
THEN 'medicare_10_pct'
ELSE 'log_iqr'
END as lower_bound_type,
CASE
WHEN is_drug_code = True
THEN 'asp'
WHEN COALESCE(iqr.n, 0) < 40
THEN 'cbsa_avg_medicare_rate_300_pct'
ELSE 'log_iqr'
END as upper_bound_type,
CASE
WHEN iqr_state.n > 40 THEN iqr_state.n
ELSE iqr.n
END as n
FROM provider_code_medicare_rates pc
LEFT JOIN asp
ON pc.bill_type = 'Outpatient'
AND pc.billing_code = asp.code
AND pc.billing_code_type = 'HCPCS'
LEFT JOIN iqr_truncated iqr
ON pc.bill_type = iqr.bill_type
AND pc.billing_code = iqr.billing_code
AND pc.billing_code_type = iqr.billing_code_type
AND pc.provider_type = iqr.provider_type
AND pc.facility IS NOT DISTINCT FROM iqr.facility
LEFT JOIN iqr_state
ON pc.bill_type = iqr_state.bill_type
AND pc.billing_code = iqr_state.billing_code
AND pc.billing_code_type = iqr_state.billing_code_type
AND pc.provider_type = iqr_state.provider_type
AND pc.facility IS NOT DISTINCT FROM iqr_state.facility
AND pc.state = iqr_state.state
LEFT JOIN distinct_counts dc
ON pc.bill_type = dc.bill_type
AND pc.billing_code = dc.billing_code
AND pc.billing_code_type = dc.billing_code_type
AND pc.provider_type = dc.provider_type
AND pc.facility IS NOT DISTINCT FROM dc.facility
LEFT JOIN cbsa_avg_medicare sam
ON pc.bill_type = sam.bill_type
AND pc.billing_code = sam.billing_code
AND pc.billing_code_type = sam.billing_code_type
AND pc.provider_type = sam.provider_type
AND pc.facility IS NOT DISTINCT FROM sam.facility
AND pc.cbsa = sam.cbsa
WHERE
asp.asp_payment_limit IS NOT NULL
OR pc.cbsa_avg_medicare_rate IS NOT NULL
OR COALESCE(iqr.n, 0) > 40
;
create_outlier_bounds_list_price​
DROP TABLE IF EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }};
CREATE TABLE IF NOT EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }} AS
WITH
-- Provider-code pairs
provider_code_list_price AS (
SELECT
DISTINCT
provider_id,
bill_type,
facility,
billing_code_type,
billing_code,
canonical_gross_charge as gross_charge
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE canonical_gross_charge BETWEEN 0.01 AND {{ params.initial_rate_threshold }}
),
-- Calculate average and IQR for each billing code on the log scale
iqr AS (
SELECT
bill_type,
facility,
billing_code_type,
billing_code,
AVG(gross_charge) as avg_gross_charge,
APPROX_PERCENTILE(ln(gross_charge), 0.25) as q1,
APPROX_PERCENTILE(ln(gross_charge), 0.75) as q3,
CAST(
APPROX_PERCENTILE(ln(gross_charge), 0.75)
- APPROX_PERCENTILE(ln(gross_charge), 0.25)
AS DECIMAL(8,5)
) as iqr,
count(*) as n
FROM provider_code_list_price
GROUP BY 1, 2, 3, 4
),
-- Truncate IQR to 1
iqr_truncated AS (
SELECT
*,
CASE
WHEN iqr > 1 THEN 1
ELSE iqr
END as iqr_truncated
FROM iqr
),
-- Distinct counts from provider-codes data
distinct_counts AS (
SELECT
bill_type,
facility,
billing_code_type,
billing_code,
COUNT(DISTINCT payer_id) AS n_distinct_payer,
COUNT(DISTINCT provider_id)AS n_distinct_provider,
COUNT(DISTINCT cbsa) AS n_distinct_cbsa
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE canonical_gross_charge BETWEEN 0.01 AND {{ params.initial_rate_threshold }}
GROUP BY 1,2,3,4
)
SELECT
pc.provider_id,
pc.bill_type,
pc.facility,
pc.billing_code_type,
pc.billing_code,
dc.n_distinct_payer,
dc.n_distinct_provider,
dc.n_distinct_cbsa,
iqr.avg_gross_charge,
COALESCE(pc.gross_charge, iqr.avg_gross_charge) as gross_charge,
q1,
q3,
iqr,
iqr_truncated,
exp(q1 - 2.5 * iqr_truncated) as lower_bound,
exp(q3 + 2.5 * iqr_truncated) as upper_bound,
CASE
WHEN q1 IS NULL or iqr_truncated IS NULL
THEN NULL
ELSE 'log_iqr'
END as lower_bound_type,
CASE
WHEN q3 IS NULL or iqr_truncated IS NULL
THEN NULL
ELSE 'log_iqr'
END as upper_bound_type,
n
FROM provider_code_list_price pc
LEFT JOIN iqr_truncated iqr
ON pc.bill_type = iqr.bill_type
AND pc.billing_code = iqr.billing_code
AND pc.billing_code_type = iqr.billing_code_type
AND COALESCE(pc.facility, true) = COALESCE(iqr.facility, true)
LEFT JOIN distinct_counts dc
ON pc.bill_type = dc.bill_type
AND pc.billing_code = dc.billing_code
AND pc.billing_code_type = dc.billing_code_type
AND COALESCE(pc.facility, true) = COALESCE(dc.facility, true)
WHERE
n > 40
;
create_outlier_bounds_cash_price​
DROP TABLE IF EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }};
CREATE TABLE IF NOT EXISTS {{ params.output_catalog }}.{{ params.output_schema }}.{{ params.output_table }} AS
WITH
-- Provider-code pairs
provider_code_cash_price AS (
SELECT
DISTINCT
provider_id,
bill_type,
facility,
billing_code_type,
billing_code,
discounted_cash_rate
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE discounted_cash_rate <= {{ params.initial_rate_threshold }}
),
-- Calculate average and IQR for each billing code on the log scale
iqr AS (
SELECT
bill_type,
facility,
billing_code_type,
billing_code,
AVG(discounted_cash_rate) as avg_discounted_cash_rate,
APPROX_PERCENTILE(ln(discounted_cash_rate), 0.25) as q1,
APPROX_PERCENTILE(ln(discounted_cash_rate), 0.75) as q3,
CAST(
APPROX_PERCENTILE(ln(discounted_cash_rate), 0.75)
- APPROX_PERCENTILE(ln(discounted_cash_rate), 0.25)
AS DECIMAL(8,5)
) as iqr,
count(*) as n
FROM provider_code_cash_price
GROUP BY 1, 2, 3, 4
),
-- Truncate IQR to 1
iqr_truncated AS (
SELECT
*,
CASE
WHEN iqr > 1 THEN 1
ELSE iqr
END as iqr_truncated
FROM iqr
),
-- Distinct counts from provider-codes data
distinct_counts AS (
SELECT
bill_type,
facility,
billing_code_type,
billing_code,
COUNT(DISTINCT payer_id) as n_distinct_payer,
COUNT(DISTINCT provider_id) as n_distinct_provider,
COUNT(DISTINCT cbsa) as n_distinct_cbsa
FROM {{ params.input_catalog }}.{{ params.input_schema }}.{{ params.input_table }}
WHERE discounted_cash_rate <= {{ params.initial_rate_threshold }}
GROUP BY 1, 2, 3, 4
)
SELECT
pc.provider_id,
pc.bill_type,
pc.facility,
pc.billing_code_type,
pc.billing_code,
dc.n_distinct_payer,
dc.n_distinct_provider,
dc.n_distinct_cbsa,
iqr.avg_discounted_cash_rate,
COALESCE(pc.discounted_cash_rate, iqr.avg_discounted_cash_rate) as discounted_cash_rate,
q1,
q3,
iqr,
iqr_truncated,
exp(q1 - 2.5 * iqr_truncated) as lower_bound,
exp(q3 + 2.5 * iqr_truncated) as upper_bound,
CASE
WHEN q1 IS NULL or iqr_truncated IS NULL
THEN NULL
ELSE 'log_iqr'
END as lower_bound_type,
CASE
WHEN q3 IS NULL or iqr_truncated IS NULL
THEN NULL
ELSE 'log_iqr'
END as upper_bound_type,
n
FROM provider_code_cash_price pc
LEFT JOIN iqr_truncated iqr
ON pc.bill_type = iqr.bill_type
AND pc.billing_code = iqr.billing_code
AND pc.billing_code_type = iqr.billing_code_type
AND COALESCE(pc.facility, true) = COALESCE(iqr.facility, true)
LEFT JOIN distinct_counts dc
ON pc.bill_type = dc.bill_type
AND pc.billing_code = dc.billing_code
AND pc.billing_code_type = dc.billing_code_type
AND COALESCE(pc.facility, true) = COALESCE(dc.facility, true)
WHERE
n > 40
;