source: main/trunk/db/src/main/scripts/mysql/migrate/10_data-dataset.sql @ 25198

Last change on this file since 25198 was 25198, checked in by GarthBraithwaite_STG, 3 months ago

db - updated the ds create to key off of SO3 and "Country" and cleaned out dup SO1 and SO3 usage. Added map_name cleanup to set to None vs DEFAULT. Added more select tests. Added test for VALUE_ATTR and null measure value.

File size: 16.6 KB
Line 
1/*
2        The point of this table is to create a mapping between the IV/IVV tables
3        and the new DATASET table.  This sql script builds a temp table so that the
4        unique combinations of IPV names, dimension names, data sources, measure
5        name(s) are captured.  This requires a cleaned up and standardized DS to IV
6        sort order (see below).  The DS name is then built and updated where the
7        actual DS record can then reference this table based on that record's dims
8        to determine the approp dataset to be put into.
9       
10        This table once created and initially populated could be exported to Excel
11        where Kim/Lois can then make decisions for data clean up.  It could then be
12        imported and the balance of the scripts ran which are based off of this TMP
13        table.
14
15        DATASET DATASOURCE NOTES:
16        Datasets are based on a singular numerator datasource.  Numerator data
17        sources are determined by the DATA_SOURCE_TO_INDICATOR_VIEW's SORT_ORDER.
18        Lois standardized/updated version 2.3 to have all SO=3's to be the US
19        dataset with SO=1 being the standard numerator for non US datasets.  SO=2
20        is an optional denom source for the SO=1 numerator.  SO3 should all be
21        inserted straight up as both numer and denom.
22
23        NOTES: The special SO3 splits is semi handled.  For example BRFSS_Wgt is a
24        special split usage.  The IP has a SO3 with CDC_BRFSS and a SO1 with a data
25        source of BRFSS.  So the script that populates will only need to test for
26        specific dimenison values of say 1&2 or 3&4 and insert into the approp ds.
27
28        See previous 3/14/2018 commits for other notes and approaches. 
29*/
30
31/* Before beginning do a quick test of IVVs to see what DS to IV SORT_ORDER=3
32        records are missing.  If any the DS to IV needs to be updated with approp SO3s.
33*/
34select ivv.INDICATOR_VIEW_NAME
35from INDICATOR_VIEW_VALUE ivv
36where ivv.VALUE is not null
37  and ivv.VALUE not like '*%'
38  and ivv.VALUE > ''
39  and ivv.ACTIVE_FLAG = 'x'
40  and ((DIMENSION_1_NAME = 'Country') or (DIMENSION_2_NAME = 'Country'))
41  and ivv.INDICATOR_VIEW_NAME not in 
42        (
43                select dstiv.INDICATOR_VIEW_NAME 
44                from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
45                where dstiv.SORT_ORDER = 3
46        )
47group by ivv.INDICATOR_VIEW_NAME
48;
49
50
51drop table if exists TMP_INDICATOR_VIEW_DATASET;
52
53create table TMP_INDICATOR_VIEW_DATASET(
54  INDICATOR_NAME                  varchar  (100),
55  INDICATOR_VIEW_NAME             varchar  (200),
56  INDICATOR_VIEW_ACTIVE_FLAG      varchar    (1),
57  MEASURE_NAME                    varchar  (100),
58  NUMERATOR_DATA_SOURCE_NAME      varchar  (100),
59  DATA_SOURCE_SORT_ORDER          int             default null,
60  PERIOD_DIMENSION_NAME           varchar  (100),
61  DIMENSION_1_NAME                varchar  (100),
62  DIMENSION_2_NAME                varchar  (100),
63  DATASET_NAME                    varchar  (200)
64);
65
66
67delete from TMP_INDICATOR_VIEW_DATASET;
68
69
70/* basic counts */
71select count(*) from INDICATOR_VIEW;
72select count(*) from DATA_SOURCE_TO_INDICATOR_VIEW;
73
74/* The standard test for IVs that don't have a data source */
75select * from DATA_SOURCE_TO_INDICATOR_VIEW
76where INDICATOR_VIEW_NAME not in 
77        (select NAME from INDICATOR_VIEW)
78;
79
80/* Another way to check views (active) that don't have a DS */
81select NAME, ACTIVE_FLAG
82from INDICATOR_VIEW
83where NAME not in (select INDICATOR_VIEW_NAME from DATA_SOURCE_TO_INDICATOR_VIEW)
84;
85
86/* Count of IVVs that won't be picked up because no DS to IV mapping. */
87select iv.NAME, iv.ACTIVE_FLAG, count(ivv.INDICATOR_VIEW_NAME)
88from INDICATOR_VIEW iv, INDICATOR_VIEW_VALUE ivv
89where iv.NAME not in (select INDICATOR_VIEW_NAME from DATA_SOURCE_TO_INDICATOR_VIEW)
90  and ivv.INDICATOR_VIEW_NAME = iv.NAME
91;
92
93
94
95/* Insert all the SO1. */
96insert into TMP_INDICATOR_VIEW_DATASET (
97                INDICATOR_NAME, INDICATOR_VIEW_NAME, 
98                NUMERATOR_DATA_SOURCE_NAME, DATA_SOURCE_SORT_ORDER,
99                PERIOD_DIMENSION_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME
100        )
101select  substring_index(ivv.INDICATOR_VIEW_NAME, '.', 1), ivv.INDICATOR_VIEW_NAME, 
102        dstiv.DATA_SOURCE_NAME, dstiv.SORT_ORDER,
103        ivv.PERIOD_DIMENSION_NAME, ivv.DIMENSION_1_NAME, ifnull(ivv.DIMENSION_2_NAME, 'NA')
104from INDICATOR_VIEW_VALUE ivv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
105where dstiv.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
106  and dstiv.SORT_ORDER = 1
107group by 2, 3, 4, 5, 6, 7
108;
109
110/* quick test to see what DS_TO_IV do not have a SO1 */
111select distinct INDICATOR_VIEW_NAME
112from TMP_INDICATOR_VIEW_DATASET
113where INDICATOR_VIEW_NAME not in 
114        (select INDICATOR_VIEW_NAME
115         from DATA_SOURCE_TO_INDICATOR_VIEW
116        )
117;
118
119
120
121/* Insert the US SO3 entries */
122insert into TMP_INDICATOR_VIEW_DATASET (
123                INDICATOR_NAME, INDICATOR_VIEW_NAME, 
124                NUMERATOR_DATA_SOURCE_NAME, DATA_SOURCE_SORT_ORDER,
125                PERIOD_DIMENSION_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME
126        )
127select  substring_index(ivv.INDICATOR_VIEW_NAME, '.', 1), ivv.INDICATOR_VIEW_NAME, 
128        dstiv.DATA_SOURCE_NAME, dstiv.SORT_ORDER,
129        ivv.PERIOD_DIMENSION_NAME, ivv.DIMENSION_1_NAME, ifnull(ivv.DIMENSION_2_NAME, 'NA')
130from INDICATOR_VIEW_VALUE ivv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
131where dstiv.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
132  and dstiv.SORT_ORDER = 3
133  and ((ivv.DIMENSION_1_NAME = 'Country') or (ivv.DIMENSION_2_NAME = 'Country'))
134group by 2, 3, 4, 5, 6, 7
135;
136
137
138/* remove the US SO1 entries where SO 3 exists */
139select *
140from TMP_INDICATOR_VIEW_DATASET
141where DATA_SOURCE_SORT_ORDER = 1
142  and ((DIMENSION_1_NAME = 'Country') or (DIMENSION_2_NAME = 'Country'))
143  and INDICATOR_VIEW_NAME in
144        (select INDICATOR_VIEW_NAME
145         from TMP_INDICATOR_VIEW_DATASET
146         where DATA_SOURCE_SORT_ORDER = 3
147        )
148;
149
150delete tivd1 from TMP_INDICATOR_VIEW_DATASET tivd1
151join TMP_INDICATOR_VIEW_DATASET tivd2
152  on tivd2.INDICATOR_VIEW_NAME = tivd1.INDICATOR_VIEW_NAME
153where tivd1.DATA_SOURCE_SORT_ORDER = 1
154  and ((tivd1.DIMENSION_1_NAME = 'Country') or (tivd1.DIMENSION_2_NAME = 'Country'))
155  and tivd2.DATA_SOURCE_SORT_ORDER = 3
156;
157
158
159/*  The SO3 insert is very specific to only include those records that have a
160        US related dimension (Country or GeoCountry).  Inserting the blanket SO1
161        in some cases results in some dup US type entries for SO1 and some dup
162        NON US entries for SO3.
163
164        Test to make sure there are not any SO1 and SO3 both having the same data
165        source name.  If there are some then need to adjust the DS to IV accordingly.
166*/
167select *
168from TMP_INDICATOR_VIEW_DATASET tivd
169where DATA_SOURCE_SORT_ORDER = 1
170  and INDICATOR_VIEW_NAME in 
171        (select tivd2.INDICATOR_VIEW_NAME
172         from TMP_INDICATOR_VIEW_DATASET tivd2
173         where tivd2.DATA_SOURCE_SORT_ORDER = 3
174           and tivd2.NUMERATOR_DATA_SOURCE_NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
175        )
176order by INDICATOR_VIEW_NAME, NUMERATOR_DATA_SOURCE_NAME, DIMENSION_1_NAME,  DIMENSION_2_NAME
177;
178
179/* If needing to delete the SO1 DS then could do something like:
180delete
181from TMP_INDICATOR_VIEW_DATASET t2
182where DATA_SOURCE_SORT_ORDER = 1
183  and INDICATOR_VIEW_NAME in
184        (select t1.INDICATOR_VIEW_NAME from
185                (select INDICATOR_VIEW_NAME
186                 from TMP_INDICATOR_VIEW_DATASET
187                 where DATA_SOURCE_SORT_ORDER = 3
188                   and t2.DATA_SOURCE_NAME = t1.DATA_SOURCE_NAME
189                ) t1
190        )
191;
192*/
193
194
195/* set the measaure name, active flag, and update the IP name - just in case */
196update TMP_INDICATOR_VIEW_DATASET tmp
197        inner join 
198                (select NAME, INDICATOR_NAME, MEASURE_NAME, ACTIVE_FLAG from INDICATOR_VIEW) iv
199                on iv.NAME = tmp.INDICATOR_VIEW_NAME
200set tmp.INDICATOR_VIEW_ACTIVE_FLAG = iv.ACTIVE_FLAG,
201        tmp.MEASURE_NAME               = iv.MEASURE_NAME,
202        tmp.INDICATOR_NAME             = iv.INDICATOR_NAME
203;
204
205/* set the dataset name */
206update TMP_INDICATOR_VIEW_DATASET
207set DATASET_NAME = concat(INDICATOR_NAME, '.', MEASURE_NAME, '.', NUMERATOR_DATA_SOURCE_NAME)
208where DATASET_NAME is null
209;
210
211
212/* check any temp DS that do NOT have a DS NAME */
213select * from TMP_INDICATOR_VIEW_DATASET where DATASET_NAME is null;
214
215/* List of temp IV datasets
216select INDICATOR_NAME, DATASET_NAME, count(DATASET_NAME)
217from TMP_INDICATOR_VIEW_DATASET
218group by DATASET_NAME, INDICATOR_NAME
219;
220*/
221
222
223
224
225/* create the DATASET records based on the temp table */
226delete from DATASET;
227
228insert into DATASET (
229        NAME, INDICATOR_NAME, 
230        TITLE,
231        NUMERATOR, DENOMINATOR, DATA_NOTE, 
232        MEASURE_NAME, SORT_ORDER, ACTIVE_FLAG, MODIFIED_DATE)
233select 
234        tivd.DATASET_NAME, tivd.INDICATOR_NAME, 
235        concat('TITLE: ', tivd.MEASURE_NAME, ', ', tivd.NUMERATOR_DATA_SOURCE_NAME),
236        i.NUMERATOR, i.DENOMINATOR, substring(i.DATA_NOTE, 1, 1000), 
237        tivd.MEASURE_NAME, iv.SORT_ORDER, 'x', iv.MODIFIED_DATE
238from INDICATOR i, INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd
239where i.NAME  = tivd.INDICATOR_NAME
240  and iv.NAME = tivd.INDICATOR_VIEW_NAME
241group by tivd.DATASET_NAME
242;
243
244/* basic count of temp datasets to actual */
245select count(*) from TMP_INDICATOR_VIEW_DATASET;
246select count(*) from DATASET;
247
248
249/* update any null dataset data notes via the IPV */
250update DATASET ds set DATA_NOTE = 
251        (select iv.DATA_NOTE
252         from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd
253         where iv.NAME = tivd.INDICATOR_VIEW_NAME
254           and tivd.DATASET_NAME = ds.NAME
255           and iv.DATA_NOTE is not null
256         limit 1
257        )
258where ds.DATA_NOTE is null;
259
260/* update the title and desc via the DS and MEASURE */
261update DATASET d set TITLE = 
262        ifnull( substr(
263                (select concat(ds.NAME, ', ', m.TITLE)
264                 from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd, MEASURE m, DATA_SOURCE ds
265                 where iv.NAME = tivd.INDICATOR_VIEW_NAME
266                   and d.NAME  = tivd.DATASET_NAME
267                   and ds.NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
268                   and m.NAME  = tivd.MEASURE_NAME
269                 limit 1
270                ), 1, 100)
271        , '')
272;
273update DATASET d set DESCRIPTION = 
274        (select concat('Dataset of ', m.TITLE, ' for ', ds.TITLE)
275         from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd, MEASURE m, DATA_SOURCE ds
276         where iv.NAME = tivd.INDICATOR_VIEW_NAME
277           and d.NAME  = tivd.DATASET_NAME
278           and ds.NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
279           and m.NAME  = tivd.MEASURE_NAME
280         limit 1
281        )
282;
283
284
285
286
287/* Insert the DATA_SOURCE_TO_DATASET records.
288        Indicator view to data source sort order which specifies 1=numerator, 2=denomn, 3=US
289        1 only:  1 = numer, 1 = denom
290        1, 2:    1 = numer, 2 = denom
291        1, 2, 3: 1 = State numer, 2 = State denom  ,,  3 = US numer, US denom
292        1, 3:    1 = State numer, 1 = State denom  ,,  3 = US numer, US denom
293        3 only:  3 = US numer, 3 = US denom
294*/
295/* The numerator data source in the temp table is ALWAYS the numerator DS. */
296delete from DATA_SOURCE_TO_DATASET;
297
298insert into DATA_SOURCE_TO_DATASET (DATASET_NAME, DATA_SOURCE_NAME, DATA_SOURCE_USAGE)
299select DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME, 'numerator'
300from TMP_INDICATOR_VIEW_DATASET
301group by DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME
302;
303
304/* As a general catch all set the denom datasource to the numerator DS.  This
305        is accurate for both only 1, or only 1&3 e.g. no SO2.
306*/
307insert into DATA_SOURCE_TO_DATASET (DATASET_NAME, DATA_SOURCE_NAME, DATA_SOURCE_USAGE)
308select DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME, 'denominator'
309from TMP_INDICATOR_VIEW_DATASET
310group by DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME
311;
312
313/* Need to update the denominator dataset datasource usage when there is a
314        specific denom DS (sort order 2).   Note the use of ifnull to set it to
315        itself if no SO2 record exists (e.g. don't set it to null).
316
317        NOTES: the ds to IV has around 500 SO2's.  However, many of those IV's are
318        consolidated into fewer datasets.  So update of dataset to SO2 ==> 117
319        which is inline with the ratio of IVs to DSs.
320
321        The select returns approx 135 rows of SO2 for the assoc IPs.  The update
322        results in approx 150 rows updated.
323*/
324select *
325from DATA_SOURCE_TO_INDICATOR_VIEW
326where SORT_ORDER = 2
327group by substring_index(INDICATOR_VIEW_NAME, '.', 1), DATA_SOURCE_NAME
328;
329
330select * from DATA_SOURCE_TO_DATASET dstd
331where DATA_SOURCE_NAME = (select dstiv.DATA_SOURCE_NAME
332         from DATA_SOURCE_TO_INDICATOR_VIEW dstiv, TMP_INDICATOR_VIEW_DATASET tivd
333         where tivd.DATASET_NAME = dstd.DATASET_NAME
334           and tivd.DATA_SOURCE_SORT_ORDER = 1
335           and dstiv.INDICATOR_VIEW_NAME = tivd.INDICATOR_VIEW_NAME
336           and dstiv.SORT_ORDER = 2
337         group by tivd.DATASET_NAME, dstiv.DATA_SOURCE_NAME
338         limit 1
339        )
340  and DATA_SOURCE_USAGE = 'denominator'
341;
342
343
344update DATA_SOURCE_TO_DATASET dstd
345set DATA_SOURCE_NAME = ifnull(
346        (select dstiv.DATA_SOURCE_NAME
347         from DATA_SOURCE_TO_INDICATOR_VIEW dstiv, TMP_INDICATOR_VIEW_DATASET tivd
348         where tivd.DATASET_NAME = dstd.DATASET_NAME
349           and tivd.DATA_SOURCE_SORT_ORDER = 1
350           and dstiv.INDICATOR_VIEW_NAME = tivd.INDICATOR_VIEW_NAME
351           and dstiv.SORT_ORDER = 2
352         group by tivd.DATASET_NAME, dstiv.DATA_SOURCE_NAME
353         limit 1
354        ), DATA_SOURCE_NAME)  /* use 'garth' to test which ones do NOT have a SO of 2 of the 322 exactly 1/2 do not.... */
355where DATA_SOURCE_USAGE = 'denominator'
356;
357
358/* TEST: use 'garth' to test which ones do NOT have a SO of 2 of the 307 out of 894
359        ), concat('garth', DATA_SOURCE_NAME) ) 
360
361select * from DATA_SOURCE_TO_DATASET where data_source_name like 'garth%';
362*/
363
364
365
366/* DATASET to IV */
367delete from DATASET_TO_INDICATOR_VIEW;
368
369insert into DATASET_TO_INDICATOR_VIEW(DATASET_NAME, INDICATOR_VIEW_NAME)
370select DATASET_NAME, INDICATOR_VIEW_NAME
371from TMP_INDICATOR_VIEW_DATASET
372where DATASET_NAME        is not null
373  and INDICATOR_VIEW_NAME is not null
374group by INDICATOR_VIEW_NAME, DATASET_NAME
375;
376
377
378
379/* create the DIMENSION to DATASET records based on the dataset's records.  Prior
380        to 10/5/16 these records were based on the view.
381       
382        5/5/2017, changed logic to use the tmp table and not join on the IVV.
383*/
384delete from DIMENSION_TO_DATASET;
385
386insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
387select tivd.DATASET_NAME, tivd.PERIOD_DIMENSION_NAME, 'period' 
388from TMP_INDICATOR_VIEW_DATASET tivd
389group by 1, 2
390;
391
392insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
393select tivd.DATASET_NAME, tivd.DIMENSION_1_NAME, 'dimension1' 
394from TMP_INDICATOR_VIEW_DATASET tivd
395where tivd.DIMENSION_1_NAME is not null
396group by 1, 2
397;
398
399insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
400select tivd.DATASET_NAME, tivd.DIMENSION_2_NAME, 'dimension2' 
401from TMP_INDICATOR_VIEW_DATASET tivd
402where tivd.DIMENSION_2_NAME is not null
403  and not exists 
404         (select 1
405           from DIMENSION_TO_DATASET dtd2
406           where dtd2.DIMENSION_USAGE = 'dimension1'
407                 and dtd2.DIMENSION_NAME  = tivd.DIMENSION_2_NAME
408                 and dtd2.DATASET_NAME    = tivd.DATASET_NAME
409         )
410group by 1, 2
411;
412
413
414
415/* check - make sure no dupicate dim1/dim2 usages. */
416select * from DIMENSION_TO_DATASET dtd1
417where dtd1.DIMENSION_USAGE = 'dimension2'
418  and exists 
419         (select 1
420           from DIMENSION_TO_DATASET dtd2
421           where dtd2.DIMENSION_USAGE = 'dimension1'
422                 and dtd2.DIMENSION_NAME  = dtd1.DIMENSION_NAME
423                 and dtd2.DATASET_NAME    = dtd1.DATASET_NAME
424         )
425;
426
427delete from DIMENSION_TO_DATASET
428where DIMENSION_USAGE = 'dimension2'
429  and (DATASET_NAME, DIMENSION_NAME) in 
430        (select dtd.DATASET_NAME, dtd.DIMENSION_NAME
431         from (select DATASET_NAME, DIMENSION_NAME
432                   from DIMENSION_TO_DATASET
433                   where DIMENSION_USAGE = 'dimension1'
434                  ) as dtd
435        )
436;
437
438
439/* clean out the NA dims */
440delete from DIMENSION_TO_DATASET where DIMENSION_NAME = 'NA';
441
442
443
444/* Set the dataset's DATA_ISSUE to the associated IP.DATA_NOTE field.  This is
445        done via the DATASOURCE_TO_IV and the IP.DEFAULT_INDICATOR_VIEW_NAME
446        and the SORT_ORDER being 1 (which is the numerator data source).
447*/
448update DATA_SOURCE ds
449set ds.DATA_ISSUES = 
450        (select i.DATA_NOTE
451         from INDICATOR i
452         where i.DATA_NOTE is not null
453           and i.DEFAULT_INDICATOR_VIEW_NAME = 
454                (select dstiv.INDICATOR_VIEW_NAME
455                 from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
456                 where dstiv.DATA_SOURCE_NAME = ds.NAME
457                   and dstiv.SORT_ORDER = 1
458                 group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
459                 limit 1
460                )
461        ) 
462where ds.DATA_ISSUES is null
463;
464
465update DATA_SOURCE ds
466set ds.DATA_ISSUES = 
467        (select i.DATA_NOTE
468         from INDICATOR i
469         where i.DATA_NOTE is not null
470           and i.DEFAULT_INDICATOR_VIEW_NAME = 
471                (select dstiv.INDICATOR_VIEW_NAME
472                 from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
473                 where dstiv.DATA_SOURCE_NAME = ds.NAME
474                 group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
475                 limit 1
476                )
477        ) 
478where ds.DATA_ISSUES is null
479;
480
481update DATA_SOURCE ds
482set ds.DATA_ISSUES = 
483        (select i.DATA_NOTE
484         from INDICATOR i, INDICATOR_VIEW iv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
485         where i.DATA_NOTE is not null
486           and i.NAME = iv.INDICATOR_NAME
487           and iv.NAME = dstiv.INDICATOR_VIEW_NAME
488           and dstiv.DATA_SOURCE_NAME = ds.NAME
489         group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
490         limit 1
491        ) 
492where ds.DATA_ISSUES is null
493;
494/*------------------------------ END OF FILE ------------------------------*/
Note: See TracBrowser for help on using the repository browser.