source: main/trunk/db/src/main/scripts/mysql/migrate/10_data-dataset.sql @ 25095

Last change on this file since 25095 was 25095, checked in by GarthBraithwaite_STG, 4 months ago

db - minor syntax tweaks.

File size: 16.3 KB
Line 
1/*
2        The point of this table is to create a mapping between the IV/IVV tables
3        and the new DATASET table.  This sql script builds a temp table so that the
4        unique combinations of IPV names, dimension names, data sources, measure
5        name(s) are captured.  This requires a cleaned up and standardized DS to IV
6        sort order (see below).  The DS name is then built and updated where the
7        actual DS record can then reference this table based on that record's dims
8        to determine the approp dataset to be put into.
9       
10        This table once created and initially populated could be exported to Excel
11        where Kim/Lois can then make decisions for data clean up.  It could then be
12        imported and the balance of the scripts ran which are based off of this TMP
13        table.
14
15        DATASET DATASOURCE NOTES:
16        Datasets are based on a singular numerator datasource.  Numerator data
17        sources are determined by the DATA_SOURCE_TO_INDICATOR_VIEW's SORT_ORDER.
18        Lois standardized/updated version 2.3 to have all SO=3's to be the US
19        dataset.  SO=1 is the standard numerator for non US data.  SO=2 is the denom
20        source for the SO=1 numerator.  As such here's the rules:
21        1) GeoCountry & SO3 should all be inserted straight up.
22        2) Insert the SO3 special split usages like:
23                'ACA_Pre_Post', 'BRFSS_Wgt', 'SexStateUS', 'Girls15-17_18-19', 'DiagPriAny'.
24                This is needed so that an actual dataset with the approp measure name
25                and data source name will be created.  The data-dataset_record script
26                will handle/create the approp split records.
27        3) All SO 1 should be processed straight up. 
28
29        NOTES: The special SO3 splits is semi handled.  For example BRFSS_Wgt is a
30        special split usage.  The IP has a SO3 with CDC_BRFSS and a SO1 with a data
31        source of BRFSS.  So the script that populates will only need to test for
32        specific dimenison values of say 1&2 or 3&4 and insert into the approp ds.
33
34        See previous 3/14/2018 commits for other notes and approaches. 
35*/
36
37/* quick test to see what DS to IV SORT_ORDER=3 records are missing */
38select ivv.INDICATOR_VIEW_NAME
39from INDICATOR_VIEW_VALUE ivv
40where ivv.VALUE is not null
41  and ivv.VALUE not like '*%'
42  and ivv.VALUE > ''
43  and ivv.ACTIVE_FLAG = 'x'
44  and ((DIMENSION_1_NAME = 'GeoCountry') or (DIMENSION_2_NAME = 'GeoCountry'))
45  and ivv.INDICATOR_VIEW_NAME not in 
46        (
47                select dstiv.INDICATOR_VIEW_NAME 
48                from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
49                where dstiv.SORT_ORDER = 3
50        )
51group by ivv.INDICATOR_VIEW_NAME
52;
53
54
55drop table if exists TMP_INDICATOR_VIEW_DATASET;
56
57create table TMP_INDICATOR_VIEW_DATASET(
58  INDICATOR_NAME                  varchar  (100),
59  INDICATOR_VIEW_NAME             varchar  (200),
60  INDICATOR_VIEW_ACTIVE_FLAG      varchar    (1),
61  MEASURE_NAME                    varchar  (100),
62  NUMERATOR_DATA_SOURCE_NAME      varchar  (100),
63  DATA_SOURCE_SORT_ORDER          int             default null,
64  PERIOD_DIMENSION_NAME           varchar  (100),
65  DIMENSION_1_NAME                varchar  (100),
66  DIMENSION_2_NAME                varchar  (100),
67  DATASET_NAME                    varchar  (200)
68);
69
70
71delete from TMP_INDICATOR_VIEW_DATASET;
72
73
74/* do some basic tests to make sure all IPVs have a data source */
75select count(*) from INDICATOR_VIEW;
76select count(*) from DATA_SOURCE_TO_INDICATOR_VIEW;
77
78select NAME, ACTIVE_FLAG
79from INDICATOR_VIEW
80where NAME not in (select INDICATOR_VIEW_NAME from DATA_SOURCE_TO_INDICATOR_VIEW)
81;
82
83select iv.NAME, iv.ACTIVE_FLAG, count(ivv.INDICATOR_VIEW_NAME)
84from INDICATOR_VIEW iv, INDICATOR_VIEW_VALUE ivv
85where iv.NAME not in (select INDICATOR_VIEW_NAME from DATA_SOURCE_TO_INDICATOR_VIEW)
86  and ivv.INDICATOR_VIEW_NAME = iv.NAME
87;
88
89select * from DATA_SOURCE_TO_INDICATOR_VIEW
90where INDICATOR_VIEW_NAME not in 
91        (select NAME from INDICATOR_VIEW)
92;
93
94
95
96/* Insert all the SO1. */
97insert into TMP_INDICATOR_VIEW_DATASET (
98                INDICATOR_NAME, INDICATOR_VIEW_NAME, 
99                NUMERATOR_DATA_SOURCE_NAME, DATA_SOURCE_SORT_ORDER,
100                PERIOD_DIMENSION_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME
101        )
102select  substring_index(ivv.INDICATOR_VIEW_NAME, '.', 1), ivv.INDICATOR_VIEW_NAME, 
103        dstiv.DATA_SOURCE_NAME, dstiv.SORT_ORDER,
104        ivv.PERIOD_DIMENSION_NAME, ivv.DIMENSION_1_NAME, ifnull(ivv.DIMENSION_2_NAME, 'NA')
105from INDICATOR_VIEW_VALUE ivv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
106where dstiv.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
107  and dstiv.SORT_ORDER = 1
108group by 2, 3, 4, 5, 6, 7
109;
110
111/* quick test to see what DS_TO_IV is not used */
112select distinct INDICATOR_VIEW_NAME
113from TMP_INDICATOR_VIEW_DATASET
114where INDICATOR_VIEW_NAME not in 
115        (select INDICATOR_VIEW_NAME
116         from DATA_SOURCE_TO_INDICATOR_VIEW
117        )
118;
119
120
121
122/* Insert the US SO3 entries */
123insert into TMP_INDICATOR_VIEW_DATASET (
124                INDICATOR_NAME, INDICATOR_VIEW_NAME, 
125                NUMERATOR_DATA_SOURCE_NAME, DATA_SOURCE_SORT_ORDER,
126                PERIOD_DIMENSION_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME
127        )
128select  substring_index(ivv.INDICATOR_VIEW_NAME, '.', 1), ivv.INDICATOR_VIEW_NAME, 
129        dstiv.DATA_SOURCE_NAME, dstiv.SORT_ORDER,
130        ivv.PERIOD_DIMENSION_NAME, ivv.DIMENSION_1_NAME, ifnull(ivv.DIMENSION_2_NAME, 'NA')
131from INDICATOR_VIEW_VALUE ivv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
132where dstiv.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
133  and dstiv.SORT_ORDER = 3
134group by 2, 3, 4, 5, 6, 7
135;
136
137
138/*  The SO3 insert is very specific to only include those records that have a
139        US related dimension.  Inserting the blanket SO1 results in some dup US type
140        entries as the insert keys off of the view name, dataset, SO, and DIM 1 & 2. 
141        Thus need to delete US SO1 if there's a US SO3 record.  Also need to delete
142        any NON US SO3's from the SO1 *IF* the data source name matches.  Otherwise
143        it needs to stay as those non US SO3's are the split use entries.
144       
145        Note that the "select" and "delete" counts should match.
146*/
147select *
148from TMP_INDICATOR_VIEW_DATASET
149where DATA_SOURCE_SORT_ORDER = 1
150  and INDICATOR_VIEW_NAME in 
151        (select INDICATOR_VIEW_NAME
152         from TMP_INDICATOR_VIEW_DATASET
153         where DATA_SOURCE_SORT_ORDER = 3
154        )
155order by INDICATOR_VIEW_NAME, NUMERATOR_DATA_SOURCE_NAME, DIMENSION_1_NAME,  DIMENSION_2_NAME
156;
157
158delete 
159from TMP_INDICATOR_VIEW_DATASET
160where DATA_SOURCE_SORT_ORDER = 1
161  and INDICATOR_VIEW_NAME in 
162        (select t1.INDICATOR_VIEW_NAME from 
163                (select INDICATOR_VIEW_NAME
164                 from TMP_INDICATOR_VIEW_DATASET
165                 where DATA_SOURCE_SORT_ORDER = 3
166                ) t1
167        )
168;
169
170
171
172/* Now process for the special SO3 uses - shouldn't be any - if some delete */
173select *
174from TMP_INDICATOR_VIEW_DATASET tivd
175where DATA_SOURCE_SORT_ORDER = 1
176  and INDICATOR_VIEW_NAME in 
177        (select tivd2.INDICATOR_VIEW_NAME
178         from TMP_INDICATOR_VIEW_DATASET tivd2
179         where tivd2.DATA_SOURCE_SORT_ORDER = 3
180           and tivd2.NUMERATOR_DATA_SOURCE_NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
181        )
182order by INDICATOR_VIEW_NAME, NUMERATOR_DATA_SOURCE_NAME, DIMENSION_1_NAME,  DIMENSION_2_NAME
183;
184
185
186
187/* Finally, should not have any of these but test/remove all SO3's that only have 1 US SO1 */
188select *
189from TMP_INDICATOR_VIEW_DATASET tivd
190where tivd.DATA_SOURCE_SORT_ORDER = 3
191  and 1 = 
192        (select count(*) 
193         from TMP_INDICATOR_VIEW_DATASET t
194         where t.DATA_SOURCE_SORT_ORDER = 1
195           and t.INDICATOR_VIEW_NAME = tivd.INDICATOR_VIEW_NAME
196        )
197;
198
199/* code that can be modified if the above exists...
200delete
201from TMP_INDICATOR_VIEW_DATASET
202where DATA_SOURCE_SORT_ORDER = 3
203  and INDICATOR_VIEW_NAME = 
204        (select t.INDICATOR_VIEW_NAME from
205                (select INDICATOR_VIEW_NAME
206                 from TMP_INDICATOR_VIEW_DATASET
207                 where DATA_SOURCE_SORT_ORDER = 1
208                   and ((DIMENSION_1_NAME = 'GeoCountry') or (DIMENSION_2_NAME = 'GeoCountry'))
209                 group by INDICATOR_VIEW_NAME
210                 having count(INDICATOR_VIEW_NAME) > 1
211                ) t
212        )
213;
214*/
215
216
217/* set the measaure name, active flag, and update the IP name - just in case */
218update TMP_INDICATOR_VIEW_DATASET tmp
219        inner join 
220                (select NAME, INDICATOR_NAME, MEASURE_NAME, ACTIVE_FLAG from INDICATOR_VIEW) iv
221                on iv.NAME = tmp.INDICATOR_VIEW_NAME
222set tmp.INDICATOR_VIEW_ACTIVE_FLAG = iv.ACTIVE_FLAG,
223        tmp.MEASURE_NAME               = iv.MEASURE_NAME,
224        tmp.INDICATOR_NAME             = iv.INDICATOR_NAME
225;
226
227/* set the dataset name */
228update TMP_INDICATOR_VIEW_DATASET
229set DATASET_NAME = concat(INDICATOR_NAME, '.', MEASURE_NAME, '.', NUMERATOR_DATA_SOURCE_NAME)
230where DATASET_NAME is null
231;
232
233select count(*) from TMP_INDICATOR_VIEW_DATASET;
234select * from TMP_INDICATOR_VIEW_DATASET where DATASET_NAME is null;
235
236
237
238
239delete from DATASET;
240
241insert into DATASET (
242        NAME, INDICATOR_NAME, 
243        TITLE,
244        NUMERATOR, DENOMINATOR, DATA_NOTE, 
245        MEASURE_NAME, SORT_ORDER, ACTIVE_FLAG, MODIFIED_DATE)
246select 
247        tivd.DATASET_NAME, tivd.INDICATOR_NAME, 
248        concat('TITLE: ', tivd.MEASURE_NAME, ', ', tivd.NUMERATOR_DATA_SOURCE_NAME),
249        i.NUMERATOR, i.DENOMINATOR, substring(i.DATA_NOTE, 1, 1000), 
250        tivd.MEASURE_NAME, iv.SORT_ORDER, 'x', iv.MODIFIED_DATE
251from INDICATOR i, INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd
252where i.NAME  = tivd.INDICATOR_NAME
253  and iv.NAME = tivd.INDICATOR_VIEW_NAME
254group by tivd.DATASET_NAME
255;
256
257select INDICATOR_NAME, DATASET_NAME, count(DATASET_NAME)
258from TMP_INDICATOR_VIEW_DATASET
259group by DATASET_NAME, INDICATOR_NAME
260;
261
262/* update any null dataset data notes via the IPV */
263update DATASET ds set DATA_NOTE = 
264        (select iv.DATA_NOTE
265         from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd
266         where iv.NAME = tivd.INDICATOR_VIEW_NAME
267           and tivd.DATASET_NAME = ds.NAME
268           and iv.DATA_NOTE is not null
269         limit 1
270        )
271where ds.DATA_NOTE is null;
272
273/* update the title and desc via the DS and MEASURE */
274update DATASET d set TITLE = 
275        ifnull( substr(
276                (select concat(ds.NAME, ', ', m.TITLE)
277                 from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd, MEASURE m, DATA_SOURCE ds
278                 where iv.NAME = tivd.INDICATOR_VIEW_NAME
279                   and d.NAME  = tivd.DATASET_NAME
280                   and ds.NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
281                   and m.NAME  = tivd.MEASURE_NAME
282                 limit 1
283                ), 1, 100)
284        , '')
285;
286update DATASET d set DESCRIPTION = 
287        (select concat('Dataset of ', m.TITLE, ' for ', ds.TITLE)
288         from INDICATOR_VIEW iv, TMP_INDICATOR_VIEW_DATASET tivd, MEASURE m, DATA_SOURCE ds
289         where iv.NAME = tivd.INDICATOR_VIEW_NAME
290           and d.NAME  = tivd.DATASET_NAME
291           and ds.NAME = tivd.NUMERATOR_DATA_SOURCE_NAME
292           and m.NAME  = tivd.MEASURE_NAME
293         limit 1
294        )
295;
296
297
298
299
300/* Insert the DATA_SOURCE_TO_DATASET records.
301        Indicator view to data source sort order which specifies 1=numerator vs 2=denomn 3=US
302        1 only:  1 = numer, 1 = denom
303        1, 2:    1 = numer, 2 = denom
304        1, 2, 3: 1 = NM numer, 2 = NM denom  ,,  3 = US numer, US denom
305        1, 3:    1 = NM numer, 1 = NM denom  ,,  3 = US numer, US denom
306        3 only:  3 = US numer, 3 = US denom
307*/
308/* The numerator data source in the temp table is ALWAYS the numerator DS. */
309delete from DATA_SOURCE_TO_DATASET;
310
311insert into DATA_SOURCE_TO_DATASET (DATASET_NAME, DATA_SOURCE_NAME, DATA_SOURCE_USAGE)
312select DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME, 'numerator'
313from TMP_INDICATOR_VIEW_DATASET
314group by DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME
315;
316
317/* As a general catch all set the denom datasource to the numerator DS.  This
318        is accurate for both only 1, or only 1&3 e.g. no SO2.
319*/
320insert into DATA_SOURCE_TO_DATASET (DATASET_NAME, DATA_SOURCE_NAME, DATA_SOURCE_USAGE)
321select DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME, 'denominator'
322from TMP_INDICATOR_VIEW_DATASET
323group by DATASET_NAME, NUMERATOR_DATA_SOURCE_NAME
324;
325
326/* Need to update the denominator dataset datasource usage when there is a
327        specific denom DS (sort order 2).   Note the use of ifnull to set it to
328        itself if no SO2 record exists (e.g. don't set it to null).
329
330        NOTES: the ds to IV has around 500 SO2's.  However, many of those IV's are
331        consolidated into fewer datasets.  So update of dataset to SO2 ==> 117
332        which is inline with the ratio of IVs to DSs.
333
334        The select returns approx 135 rows of SO2 for the assoc IPs.  The update
335        results in approx 150 rows updated.
336*/
337select *
338from DATA_SOURCE_TO_INDICATOR_VIEW
339where SORT_ORDER = 2
340group by substring_index(INDICATOR_VIEW_NAME, '.', 1), DATA_SOURCE_NAME
341;
342
343update DATA_SOURCE_TO_DATASET dstd
344set DATA_SOURCE_NAME = ifnull(
345        (select dstiv.DATA_SOURCE_NAME
346         from DATA_SOURCE_TO_INDICATOR_VIEW dstiv, TMP_INDICATOR_VIEW_DATASET tivd
347         where tivd.DATASET_NAME = dstd.DATASET_NAME
348           and tivd.DATA_SOURCE_SORT_ORDER = 1
349           and dstiv.INDICATOR_VIEW_NAME = tivd.INDICATOR_VIEW_NAME
350           and dstiv.SORT_ORDER = 2
351         group by tivd.DATASET_NAME, dstiv.DATA_SOURCE_NAME
352         limit 1
353        ), DATA_SOURCE_NAME)  /* use 'garth' to test which ones do NOT have a SO of 2 of the 322 exactly 1/2 do not.... */
354where DATA_SOURCE_USAGE = 'denominator'
355;
356
357
358
359
360/* DATASET to IV */
361delete from DATASET_TO_INDICATOR_VIEW;
362
363insert into DATASET_TO_INDICATOR_VIEW(DATASET_NAME, INDICATOR_VIEW_NAME)
364select DATASET_NAME, INDICATOR_VIEW_NAME
365from TMP_INDICATOR_VIEW_DATASET
366where DATASET_NAME        is not null
367  and INDICATOR_VIEW_NAME is not null
368group by INDICATOR_VIEW_NAME, DATASET_NAME
369;
370
371
372
373/* create the DIMENSION to DATASET records based on the dataset's records.  Prior
374        to 10/5/16 these records were based on the view.
375       
376        5/5/2017, changed logic to use the tmp table and not join on the IVV.
377*/
378delete from DIMENSION_TO_DATASET;
379
380insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
381select tivd.DATASET_NAME, tivd.PERIOD_DIMENSION_NAME, 'period' 
382from TMP_INDICATOR_VIEW_DATASET tivd
383group by 1, 2
384;
385
386insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
387select tivd.DATASET_NAME, tivd.DIMENSION_1_NAME, 'dimension1' 
388from TMP_INDICATOR_VIEW_DATASET tivd
389where tivd.DIMENSION_1_NAME is not null
390group by 1, 2
391;
392
393insert into DIMENSION_TO_DATASET(DATASET_NAME, DIMENSION_NAME, DIMENSION_USAGE)
394select tivd.DATASET_NAME, tivd.DIMENSION_2_NAME, 'dimension2' 
395from TMP_INDICATOR_VIEW_DATASET tivd
396where tivd.DIMENSION_2_NAME is not null
397  and not exists 
398         (select 1
399           from DIMENSION_TO_DATASET dtd2
400           where dtd2.DIMENSION_USAGE = 'dimension1'
401                 and dtd2.DIMENSION_NAME  = tivd.DIMENSION_2_NAME
402                 and dtd2.DATASET_NAME    = tivd.DATASET_NAME
403         )
404group by 1, 2
405;
406
407
408
409/* check - make sure no dupicate dim1/dim2 usages. */
410select * from DIMENSION_TO_DATASET dtd1
411where dtd1.DIMENSION_USAGE = 'dimension2'
412  and exists 
413         (select 1
414           from DIMENSION_TO_DATASET dtd2
415           where dtd2.DIMENSION_USAGE = 'dimension1'
416                 and dtd2.DIMENSION_NAME  = dtd1.DIMENSION_NAME
417                 and dtd2.DATASET_NAME    = dtd1.DATASET_NAME
418         )
419;
420
421delete from DIMENSION_TO_DATASET
422where DIMENSION_USAGE = 'dimension2'
423  and (DATASET_NAME, DIMENSION_NAME) in 
424        (select dtd.DATASET_NAME, dtd.DIMENSION_NAME
425         from (select DATASET_NAME, DIMENSION_NAME
426                   from DIMENSION_TO_DATASET
427                   where DIMENSION_USAGE = 'dimension1'
428                  ) as dtd
429        )
430;
431
432
433/* clean out the NA dims */
434delete from DIMENSION_TO_DATASET where DIMENSION_NAME = 'NA';
435
436
437
438/* Set the dataset's DATA_ISSUE to the associated IP.DATA_NOTE field.  This is
439        done via the DATASOURCE_TO_IV and the IP.DEFAULT_INDICATOR_VIEW_NAME
440        and the SORT_ORDER being 1 (which is the numerator data source).
441*/
442update DATA_SOURCE ds
443set ds.DATA_ISSUES = 
444        (select i.DATA_NOTE
445         from INDICATOR i
446         where i.DATA_NOTE is not null
447           and i.DEFAULT_INDICATOR_VIEW_NAME = 
448                (select dstiv.INDICATOR_VIEW_NAME
449                 from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
450                 where dstiv.DATA_SOURCE_NAME = ds.NAME
451                   and dstiv.SORT_ORDER = 1
452                 group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
453                 limit 1
454                )
455        ) 
456where ds.DATA_ISSUES is null
457;
458
459update DATA_SOURCE ds
460set ds.DATA_ISSUES = 
461        (select i.DATA_NOTE
462         from INDICATOR i
463         where i.DATA_NOTE is not null
464           and i.DEFAULT_INDICATOR_VIEW_NAME = 
465                (select dstiv.INDICATOR_VIEW_NAME
466                 from DATA_SOURCE_TO_INDICATOR_VIEW dstiv
467                 where dstiv.DATA_SOURCE_NAME = ds.NAME
468                 group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
469                 limit 1
470                )
471        ) 
472where ds.DATA_ISSUES is null
473;
474
475update DATA_SOURCE ds
476set ds.DATA_ISSUES = 
477        (select i.DATA_NOTE
478         from INDICATOR i, INDICATOR_VIEW iv, DATA_SOURCE_TO_INDICATOR_VIEW dstiv
479         where i.DATA_NOTE is not null
480           and i.NAME = iv.INDICATOR_NAME
481           and iv.NAME = dstiv.INDICATOR_VIEW_NAME
482           and dstiv.DATA_SOURCE_NAME = ds.NAME
483         group by dstiv.DATA_SOURCE_NAME, dstiv.INDICATOR_VIEW_NAME
484         limit 1
485        ) 
486where ds.DATA_ISSUES is null
487;
488/*------------------------------ END OF FILE ------------------------------*/
Note: See TracBrowser for help on using the repository browser.