from IPython import get_ipython
from jinja2 import Template
if get_ipython() is not None:
get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("autoreload", "2")
from dotenv import load_dotenv
import pandas as pd
from tqds.sql import db, sql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style='whitegrid', rc={'figure.figsize': (14, 12)})
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None
pd.set_option('display.float_format', lambda x: '%.3f' % x)
trino_conn = db.TrinoClient().conn
redshift_conn = db.RedshiftClient().conn
query = sql.Query()
"""
[CLD-v2.2.1] independence mappings / one-off
[CLD-v2.2.1] accuracy validated - select higher rate
[CLD-v2.2.1] prod combined abridged to always populate medicare fields
[CLD-v2.2.1] allowed amounts accuracy scoring
[CLD-v2.2.1] use provider sub type for hospitals in Clickhouse
[CLD-v2.2.1] Add NDC to Drug Code Crosswalk
[CLD-v2.2.1] validated medical services pick payer
[CLD-v2.2.1] benchmarks geo fix
[CLD-v2.2.1] rename wellmark bluecard from IA HMO to IA PPO
[CLD-v2.2.1] round rates to 2 decimals
[CLD-v2.2.1] Accuracy Macro Refactor
[CLD-v2.2.1] Plan Bridge Hierarchy for MO Anthem
[CLD-v2.2.1] use state level validated median where available else national
[CLD-v2.2.1] ROID algorithm update
[CLD-v2.2.1] provider_npi remove nulls from arrays
[CLD-v2.2.1] Provider Subtype and Provider Type
[CLD-v2.2.1] revert to official medicare reference tables
[CLD-v2.2.1] state name full (e.g. California (CA))
[CLD-v2.2.1] Physician Groups Lab Fee Schedule
[CLD-v2.2.1] Anesthesia Prioritized over Raw Rates
[CLD-v2.2.1] Premera AK
[CLD-v2.2.1] exclude individual and ifp from plan selection
"""
"""
[CLD-v2.2.1] independence mappings / one-off
"""
pd.read_sql(f"""
SELECT network_name, provider_id, provider_name, bill_type, count(*) as n
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_clickhouse_abridged
WHERE provider_id IN ('1349', '1248', '1155', '1330', '1270')
AND canonical_rate_score > 1
AND payer_id = '56'
GROUP BY 1, 2, 3, 4
ORDER BY 2, 1, 4
""", con=trino_conn)
"""
[CLD-v2.2.1] prod combined abridged to always populate medicare fields
"""
pd.read_sql(f"""
SELECT medicare_rate, canonical_rate
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_clickhouse_abridged
WHERE provider_type = 'Hospital'
AND canonical_rate IS NOT NULL
LIMIT 10
""", con=trino_conn)
"""
[CLD-v2.2.1] allowed amounts accuracy scoring
"""
pd.read_sql(f"""
SELECT network_name, provider_id, provider_name, bill_type, count(*) as n
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_clickhouse_abridged
WHERE provider_id IN ('1349', '1248', '1155', '1330', '1270')
AND canonical_rate_score > 1
AND payer_id = '56'
GROUP BY 1, 2, 3, 4
ORDER BY 2, 1, 4
""", con=trino_conn)
"""
[CLD-v2.2.1] use provider sub type for hospitals in Clickhouse
"""
pd.read_sql(f"""
SELECT DISTINCT provider_type
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_clickhouse_abridged
""", con=trino_conn)
"""
[CLD-v2.2.1] round rates to 2 decimals
"""
pd.read_sql("""
DESCRIBE tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
""", con=trino_conn)
pd.read_sql("""
SELECT
canonical_rate,
medicare_rate,
state_avg_medicare_rate,
cbsa_avg_medicare_rate,
asc_avg_medicare_rate,
lab_avg_medicare_rate,
asp_payment_limit,
canonical_gross_charge
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
LIMIT 10
""", con=trino_conn)
"""
[CLD-v2.2.1] Accuracy Macro Refactor
"""
for rate_col in ['payer_negotiated_rate', 'hospital_per_diem_allowed_amount', 'hospital_null_methodology_gc_hosp_perc_to_dol', 'opg_base_rate']:
df = pd.read_sql(f"""
SELECT
{rate_col},
{rate_col}_validation_score,
ABS(
normal_cdf(median, stddev, ln({rate_col}) - 0.05 * median) -
normal_cdf(median, stddev, ln({rate_col}) + 0.05 * median)
)
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_int_accuracy_brit_2025_08
WHERE {rate_col}_validation_score > 0
LIMIT 10
""", con=trino_conn)
print(df.to_markdown(index=False))
print('---')
print('\n\n')
"""
[CLD-v2.2.1] Plan Bridge Hierarchy for MO Anthem
"""
pd.read_sql(f"""
SELECT
payer_id, provider_id, billing_code_type, billing_code, contract_methodology, plan_name
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_ref_plan_bridge_base_2025_08
WHERE provider_id = '4058'
AND payer_id = 42
ORDER BY payer_id, provider_id, billing_code_type, billing_code, contract_methodology
LIMIT 100
""", con=trino_conn)
pd.read_sql(f"""
SELECT *
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_ref_plan_bridge_analysis_2025_08
WHERE provider_id = '4058'
AND payer_id = '42'
AND network_type = 'PPO'
LIMIT 100
""", con=trino_conn)
pd.read_sql(f"""
WITH
new AS (
SELECT unique_plans, count(*) as n_new
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_ref_plan_bridge_analysis_2025_08
WHERE provider_id = '4058'
AND payer_id = '42'
AND network_type = 'PPO'
AND network_name = 'MO Blue Access PPO'
GROUP BY 1
ORDER BY 2 DESC
),
old AS (
SELECT unique_plans, count(*) as n_old
FROM tq_dev.internal_dev_csong_cld_v2_2_0.tmp_ref_plan_bridge_analysis_2025_08
WHERE provider_id = '4058'
AND payer_id = '42'
AND network_type = 'PPO'
AND network_name = 'MO Blue Access PPO'
GROUP BY 1
ORDER BY 2 DESC
)
SELECT *
FROM new
FULL OUTER JOIN old USING (unique_plans)
ORDER BY 2 DESC
LIMIT 100
""", con=trino_conn)
"""
[CLD-v2.2.1] use state level validated median where available else national
"""
pd.read_sql(f"""
SELECT
payer_negotiated_untransformed_rate,
payer_negotiated_untransformed_rate_validation_score,
exp(median),
ABS(
normal_cdf(median, stddev, ln(payer_negotiated_untransformed_rate) - 0.05 * median) -
normal_cdf(median, stddev, ln(payer_negotiated_untransformed_rate) + 0.05 * median)
)
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_int_accuracy_brit_2025_08
WHERE
payer_id = '42'
AND provider_id = '6504'
AND network_id = -6972129921516377126
AND billing_code = '47562'
AND bill_type = 'Outpatient'
LIMIT 10
""", con=trino_conn)
pd.read_sql(f"""
SELECT approx_percentile(canonical_rate, 0.5)
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
WHERE
state = 'CA'
AND billing_code = '47562'
AND bill_type = 'Outpatient'
AND provider_type = 'Hospital'
""", con=trino_conn)
pd.read_sql(f"""
SELECT
count(*),
count(distinct provider_id),
avg(lower_bound),
avg(upper_bound)
FROM tq_intermediate.cld_utils.outlier_bounds_negotiated_rate_2025_10_18
UNION ALL
SELECT
count(*),
count(distinct provider_id),
avg(lower_bound),
avg(upper_bound)
FROM tq_intermediate.cld_utils.outlier_bounds_negotiated_rate_2025_09_26
""", con=trino_conn)
"""
[CLD-v2.2.1] Provider Subtype and Provider Type
"""
pd.read_sql(f"""
SELECT
c.provider_type,
c.provider_subtype,
count(*) as n
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged c
GROUP BY 1, 2
""", con=trino_conn)
"""
[CLD-v2.2.1] ROID algorithm update
"""
new = pd.read_sql(f"""
SELECT roid
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_rate_object_space_2025_08
LIMIT 1
""", con=trino_conn)
old = pd.read_sql(f"""
SELECT roid
FROM tq_dev.internal_dev_csong_cld_v2_2_0.tmp_rate_object_space_2025_08
LIMIT 1
""", con=trino_conn)
print(len(new['roid'].values[0]))
print(len(str(old['roid'].values[0])))
"""
[CLD-v2.2.1] provider_npi remove nulls from arrays
"""
pd.read_sql(f"""
select p.npi, n.npi
from tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged p cross join unnest (npi) n(npi)
where n.npi is null
AND p.npi IS NOT NULL
LIMIT 10
""", con=trino_conn)
"""
[CLD-v2.2.1] revert to official medicare reference tables
"""
df = pd.read_sql(f"""
WITH
new AS (
SELECT provider_id, billing_code, AVG(medicare_rate) AS avg_medicare_rate
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
WHERE provider_type = 'Hospital'
GROUP BY 1,2
),
old AS (
SELECT provider_id, billing_code, AVG(medicare_rate) AS avg_medicare_rate
FROM tq_dev.internal_dev_csong_cld_v2_2_0.prod_combined_abridged
WHERE provider_type LIKE '%Hospital%'
GROUP BY 1,2
)
SELECT
provider_id,
billing_code,
new.avg_medicare_rate AS new_avg_medicare_rate,
old.avg_medicare_rate AS old_avg_medicare_rate,
new.avg_medicare_rate - old.avg_medicare_rate AS rate_diff,
(new.avg_medicare_rate - old.avg_medicare_rate) / old.avg_medicare_rate AS pct_diff
FROM new
INNER JOIN old USING (provider_id, billing_code)
ORDER BY RANDOM()
LIMIT 20000
""", con=trino_conn)
df['pct_diff'].describe(percentiles=[0.5, 0.9, 0.95, 0.99])
"""
[CLD-v2.2.1] state name full (e.g. California (CA))
"""
df = pd.read_sql(f"""
SELECT distinct state, state_name_full
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
LIMIT 100
""", con=trino_conn)
print(df.to_markdown(index=False))
"""
[CLD-v2.2.1] Physician Groups Lab Fee Schedule
"""
df = pd.read_sql(f"""
SELECT provider_type, medicare_reference_source, count(*) as n
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
GROUP BY 1,2
""", con=trino_conn)
df
pd.read_sql(f"""
select
distinct
provider_type,
medicare_reference_source,
count(distinct billing_code)
from tq_dev.internal_dev_csong_cld_v2_2_0.prod_combined_abridged
where canonical_rate is not null
and is_lab_code
group by 1,2
order by 3 desc
""", con=trino_conn)
pd.read_sql(f"""
select
distinct
provider_type,
medicare_reference_source,
count(distinct billing_code)
from tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
where canonical_rate is not null
and is_lab_code
group by 1,2
order by 3 desc
""", con=trino_conn)
"""
[CLD-v2.2.1] Anesthesia Prioritized over Raw Rates
"""
pd.read_sql(f"""
SELECT payer_negotiated_rate_anesthesia_cf, canonical_rate, canonical_rate_score, canonical_rate_type, medicare_rate
FROM tq_dev.internal_dev_csong_cld_v2_2_0.tmp_int_combined_2025_08
WHERE provider_name = 'Austin Anesthesiology Group Pllc'
AND payer_id = '169'
AND billing_code = '00100'
""", con=trino_conn)
pd.read_sql(f"""
SELECT payer_negotiated_rate_anesthesia_cf, canonical_rate, canonical_rate_score, canonical_rate_type, medicare_rate
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_int_combined_2025_08
WHERE provider_name = 'Austin Anesthesiology Group Pllc'
AND payer_id = '169'
AND billing_code = '00100'
""", con=trino_conn)
"""
[CLD-v2.2.1] Premera AK
"""
pd.read_sql(f"""
SELECT
payer_id,
payer_name,
network_name,
state,
count(*) as n_rates
FROM tq_dev.internal_dev_csong_cld_v2_2_1.prod_combined_abridged
WHERE payer_id = '628'
AND canonical_rate_score > 0
GROUP BY 1,2,3,4
""", con=trino_conn)
"""
[CLD-v2.2.1] exclude individual and ifp from plan selection
"""
pd.read_sql(f"""
SELECT *
FROM tq_dev.internal_dev_csong_cld_v2_2_1.tmp_ref_plan_bridge_analysis_2025_08,
UNNEST(unique_plans) AS t(unique_plan)
WHERE (
t.unique_plan LIKE '%Individual%'
OR t.unique_plan LIKE '%IFP%'
OR t.unique_plan LIKE '%INDIVIDUAL%'
)
AND network_type IN ('PPO', 'HMO')
AND network_class != 'Exchange'
LIMIT 10
""", con=trino_conn)
"""
Compare Schemas
"""
t1 = pd.read_sql(f"""
DESCRIBE tq_dev.internal_dev_csong_cld_v2_2_1.prod_clickhouse_abridged
""", con=trino_conn)
t2 = pd.read_sql(f"""
DESCRIBE tq_dev.internal_dev_csong_cld_v2_2_0.prod_clickhouse_abridged
""", con=trino_conn)
df_merged = t1.merge(t2, on='Column', how='outer', suffixes=('_v2_2_1', '_v2_2_0'))
df_merged.loc[df_merged['Type_v2_2_1'] == df_merged['Type_v2_2_0']]