source: main/trunk/db/src/main/scripts/mysql/migrate/11_data-dataset_record.sql @ 25198

Last change on this file since 25198 was 25198, checked in by GarthBraithwaite_STG, 3 months ago

db - updated the ds create to key off of SO3 and "Country" and cleaned out dup SO1 and SO3 usage. Added map_name cleanup to set to None vs DEFAULT. Added more select tests. Added test for VALUE_ATTR and null measure value.

File size: 25.5 KB
Line 
1/* Create the dataset value records staight from the IVV records which were all
2        cleaned up in the data-ivv-dimensions.sql code.
3*/
4select count(*) from INDICATOR_VIEW_VALUE;
5select concat(INDICATOR_VIEW_NAME, 
6                        '|', DIMENSION_1_NAME, ':', DIMENSION_1_VALUE,
7                        '|', DIMENSION_2_NAME, ':', DIMENSION_2_VALUE,
8                        '|', PERIOD_DIMENSION_NAME, ':', PERIOD_DIMENSION_VALUE
9                ), count(1),
10        INDICATOR_VIEW_NAME, DATASET_CATEGORY_VALUE, DATASET_SERIES_VALUE,
11        DIMENSION_1_NAME,  DIMENSION_2_NAME,  PERIOD_DIMENSION_NAME, 
12        DIMENSION_1_VALUE, DIMENSION_2_VALUE, PERIOD_DIMENSION_VALUE, 
13        VALUE
14from INDICATOR_VIEW_VALUE
15where INDICATOR_VIEW_NAME not like 'Demo%'
16  and ACTIVE_FLAG = 'x'
17group by 1
18having count(1) > 1
19order by 2 desc;
20
21
22
23drop table if exists TMP_DATASET_RECORD;
24
25create table TMP_DATASET_RECORD(
26  INDICATOR_VIEW_NAME             varchar  (400),
27  INDICATOR_NAME                  varchar  (400),
28  DATASET_NAME                    varchar  (200), 
29  MEASURE_NAME                    varchar  (100), 
30  /* */
31  PERIOD_DIMENSION_NAME           varchar  (100)  not null,
32  PERIOD_DIMENSION_VALUE          varchar  (100)  not null,
33  DIMENSION_1_NAME                varchar  (100)  not null,
34  DIMENSION_1_VALUE               varchar  (100)  not null,
35  DIMENSION_2_NAME                varchar  (100),
36  DIMENSION_2_VALUE               varchar  (100),
37  /* */
38  NUMERATOR                       double,
39  DENOMINATOR                     double,
40  VALUE                           double,               /* can be null because if insufficient etc. 0 is not correct and * or ** is NOT allowed! */
41  LOWER_CONFIDENCE_VALUE          double,
42  UPPER_CONFIDENCE_VALUE          double,
43  LABEL                           varchar  (100),
44  NOTE                            varchar (2000),
45  /* */
46  SORT_ORDER                      int             default null,
47  ACTIVE_FLAG                     varchar    (1),
48  /* */
49  constraint TMP_DATASET_RECORD_UK
50    unique key (INDICATOR_VIEW_NAME, PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE,
51                DIMENSION_1_NAME, DIMENSION_1_VALUE, DIMENSION_2_NAME, DIMENSION_2_VALUE
52        )
53);
54
55
56/* do a straight grab - dump in all the values.  Note have to do a temp table
57        insert that does not have a composite PK because the dataset name is more
58        general as it combines geocnty, georegion, geosarea etc and thus has the
59        same multiple state records which will cause dup PK issues.
60
61        first need to set the IV values to numeric (or null if blank) so that the
62        value can be loaded into the numeric only value DATASET_RECORD.  These
63        values are later used to set the value attribute.
64        Note that the admin app only allows **, null, '', and numeric values. 
65        The -1 or * should never happen but there are records that have a label
66       
67        -1 == ** == suppressed.
68        -20 == '' or null == missing/no data.
69        -99 == numeric * 1 != - catch everything else - should never happen.
70        IBISQ allows for *, **, NA?, and ''
71*/
72delete from TMP_DATASET_RECORD;
73
74insert into TMP_DATASET_RECORD (
75        INDICATOR_VIEW_NAME, 
76        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE, 
77        DIMENSION_1_NAME, DIMENSION_1_VALUE, 
78        DIMENSION_2_NAME, DIMENSION_2_VALUE, 
79        VALUE, LOWER_CONFIDENCE_VALUE, UPPER_CONFIDENCE_VALUE, NUMERATOR, DENOMINATOR, 
80        LABEL, NOTE, ACTIVE_FLAG)
81select 
82        INDICATOR_VIEW_NAME,
83        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE, 
84        DIMENSION_1_NAME, DIMENSION_1_VALUE,
85        DIMENSION_2_NAME, DIMENSION_2_VALUE, 
86        case 
87                when (VALUE like '%*%') and (LABEL like '%uppress%small%') then  -1 
88                when (VALUE like '%*%') and (LABEL like '%uppress%')       then  -1 
89                when (VALUE like '%*%') and (LABEL like '%ery%nstable%')   then -11 
90                when (VALUE like '%*%') and (LABEL like '%nstable%')       then -10 
91                when (VALUE like '%*%') and (LABEL like '%not reliable%')  then -10 
92                when (VALUE like '%*%') and (LABEL like '%ot%vail%')       then -20 
93                when (VALUE like '%*%') and (LABEL like 'NA')              then -20 
94                when (VALUE like '%*%') and (LABEL like '%o%ata%')         then -20 
95                when (VALUE like '%*%') and (LABEL like '%ata%ot%')        then -20 
96                when (VALUE like '%*%') and (LABEL like '%nsuff%')         then -21
97                when (VALUE like '%*%') and (LABEL like '%ncomplete%')     then -21
98                when (VALUE like '%*%')                                    then  -1 
99                when (VALUE = '')                                          then -20
100                when (VALUE*1 != VALUE)                                    then -99
101                else  VALUE
102        end,
103        case 
104                when (LOWER_CONFIDENCE_VALUE = '**') then -1 
105                when (LOWER_CONFIDENCE_VALUE = '')   then -20
106                when (LOWER_CONFIDENCE_VALUE*1 != LOWER_CONFIDENCE_VALUE) then -99
107                else  LOWER_CONFIDENCE_VALUE
108        end,
109        case 
110                when (UPPER_CONFIDENCE_VALUE = '**') then -1 
111                when (UPPER_CONFIDENCE_VALUE = '')   then -20
112                when (UPPER_CONFIDENCE_VALUE*1 != UPPER_CONFIDENCE_VALUE) then -99
113                else  UPPER_CONFIDENCE_VALUE
114        end,
115        case 
116                when (NUMERATOR = '**') then -1 
117                when (NUMERATOR = '')   then -20
118                when (NUMERATOR*1 != NUMERATOR) then -99
119                else  NUMERATOR*1
120        end,
121        case 
122                when (DENOMINATOR = '**') then -1 
123                when (DENOMINATOR = '')   then -20
124                when (DENOMINATOR*1 != DENOMINATOR) then -99
125                else  DENOMINATOR*1
126        end,
127        LABEL, NOTE, ACTIVE_FLAG
128from INDICATOR_VIEW_VALUE
129;
130
131
132/* Blanket/general temp record's dataset name update.  Match the IV name and
133        dimension names against the TMP_INDICATOR_VIEW_DATASET's values to get the
134        tivd dataset name.
135*/
136update TMP_DATASET_RECORD tdr
137        inner join 
138        (select INDICATOR_VIEW_NAME, INDICATOR_NAME, MEASURE_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME, PERIOD_DIMENSION_NAME, DATASET_NAME
139         from TMP_INDICATOR_VIEW_DATASET
140        ) tivd_ij
141        on ( 
142                (tivd_ij.INDICATOR_VIEW_NAME   = tdr.INDICATOR_VIEW_NAME)
143                 and ( tivd_ij.DIMENSION_1_NAME  = tdr.DIMENSION_1_NAME)
144                 and ((tivd_ij.DIMENSION_2_NAME  = tdr.DIMENSION_2_NAME) or (tdr.DIMENSION_2_NAME is null))
145                 and ( tivd_ij.PERIOD_DIMENSION_NAME = tdr.PERIOD_DIMENSION_NAME)
146        )
147set tdr.DATASET_NAME   = tivd_ij.DATASET_NAME, 
148        tdr.MEASURE_NAME   = tivd_ij.MEASURE_NAME,
149        tdr.INDICATOR_NAME = tivd_ij.INDICATOR_NAME
150where tdr.DATASET_NAME is null
151;
152
153
154
155/* should never have these */
156select * from TMP_DATASET_RECORD tdr
157where tdr.DATASET_NAME is not null
158and not exists
159        (select INDICATOR_VIEW_NAME
160         from TMP_INDICATOR_VIEW_DATASET ij
161         where (ij.INDICATOR_VIEW_NAME   = tdr.INDICATOR_VIEW_NAME)
162        )
163group by INDICATOR_VIEW_NAME
164;
165
166/* check DSRs that don't have a dataset. */
167select * from TMP_DATASET_RECORD where DATASET_NAME is null;
168
169/* IVV names that don't match */
170select distinct INDICATOR_VIEW_NAME
171from TMP_DATASET_RECORD
172where DATASET_NAME is null
173  and INDICATOR_VIEW_NAME not in (select INDICATOR_VIEW_NAME from TMP_INDICATOR_VIEW_DATASET)
174;
175
176/* These appear to be orphans.  Not sure how they got this far as data-cleanup.sql */
177delete from TMP_DATASET_RECORD
178where DATASET_NAME is null
179  and INDICATOR_VIEW_NAME not in (select INDICATOR_VIEW_NAME from TMP_INDICATOR_VIEW_DATASET)
180;
181
182
183/* the above issues seem to be because there's not SO1 DS for the dataset name */
184select * from DATA_SOURCE_TO_INDICATOR_VIEW
185where INDICATOR_VIEW_NAME in 
186        (select distinct INDICATOR_VIEW_NAME
187         from TMP_DATASET_RECORD
188         where DATASET_NAME is null
189           and INDICATOR_VIEW_NAME not in 
190                (select INDICATOR_VIEW_NAME from TMP_INDICATOR_VIEW_DATASET)
191        )
192;
193
194
195/* Should not have these.  IF FOUND need to be researched as to why - see above.
196        3/11/2018 run has 40 with most being PopDemo and all being State, US.  The
197        current issue is NOT the State, US it's that for these the first IVV and/or
198        IV are being used to determine the DIM1 and DIM2.  In these rare cases the
199        dim usage is switched e.g. the first view uses RaceEth then GeoState.  All
200        view uses after that are using GeoState first and no DIM2 etc.  So those
201        records will have to be manuall corrected...
202*/
203select * from TMP_DATASET_RECORD tdr
204where tdr.DATASET_NAME is null
205and exists
206        (select INDICATOR_VIEW_NAME
207         from TMP_INDICATOR_VIEW_DATASET ij
208         where (ij.INDICATOR_VIEW_NAME   = tdr.INDICATOR_VIEW_NAME)
209        )
210group by INDICATOR_VIEW_NAME
211;
212
213
214/* check the IP names not being found.  Since turning off the active flag
215        restriction there should not be any BUT for the temp NM run maybe the
216        cleanup didn't get em all?
217*/
218select * from TMP_DATASET_RECORD tdr
219where tdr.DATASET_NAME is null
220and exists
221        (select INDICATOR_VIEW_NAME
222         from TMP_INDICATOR_VIEW_DATASET ij
223         where (ij.INDICATOR_VIEW_NAME   = tdr.INDICATOR_VIEW_NAME)
224        )
225group by INDICATOR_VIEW_NAME
226;
227
228
229select * from TMP_DATASET_RECORD tdr
230where tdr.DATASET_NAME is null
231and exists
232        (select INDICATOR_VIEW_NAME, DIMENSION_1_NAME, DIMENSION_2_NAME, PERIOD_DIMENSION_NAME, DATASET_NAME
233         from TMP_INDICATOR_VIEW_DATASET ij
234         where (ij.INDICATOR_VIEW_NAME   = tdr.INDICATOR_VIEW_NAME)
235           and (ij.DIMENSION_1_NAME  = tdr.DIMENSION_1_NAME)
236           and ((ij.DIMENSION_2_NAME = tdr.DIMENSION_2_NAME) or (tdr.DIMENSION_2_NAME is null))
237           and (ij.PERIOD_DIMENSION_NAME = tdr.PERIOD_DIMENSION_NAME)
238        )
239;
240
241
242
243/* now insert the unique temp records into the actual DATASET_RECORD table */
244select * from DATASET_RECORD;
245delete from DATASET_RECORD;
246
247insert into DATASET_RECORD (
248        NAME, DATASET_NAME, 
249        PERIOD_DIMENSION_NAME,  DIMENSION_1_NAME,  DIMENSION_2_NAME, 
250        PERIOD_DIMENSION_VALUE, DIMENSION_1_VALUE, DIMENSION_2_VALUE,
251        MEASURE_VALUE, 
252        VALUE_ATTRIBUTE_NAME,
253        LABEL, NOTE, ACTIVE_FLAG
254)
255select 
256        concat(DATASET_NAME, 
257                '|', PERIOD_DIMENSION_NAME,   ':', PERIOD_DIMENSION_VALUE,
258                '|', DIMENSION_1_NAME, ':', DIMENSION_1_VALUE,
259                '|', ifnull(DIMENSION_2_NAME, 'NA'), ':', ifnull(DIMENSION_2_VALUE, 'NA')
260        ),
261        ifnull(DATASET_NAME, 'garth'),
262        PERIOD_DIMENSION_NAME,  DIMENSION_1_NAME,  DIMENSION_2_NAME, 
263        PERIOD_DIMENSION_VALUE, DIMENSION_1_VALUE, DIMENSION_2_VALUE, 
264        case 
265                when (VALUE < 0) then null 
266                else VALUE
267        end,
268        case 
269                when (VALUE =  -1) then 'Suppressed'
270                when (VALUE =  -2) then 'SurveySuppressed'
271                when (VALUE = -10) then 'Unstable'
272                when (VALUE = -11) then 'VeryUnstable'
273                when (VALUE = -20) then 'NoData'
274                when (VALUE = -21) then 'Insufficient'
275                when (VALUE <   0) then 'ERROR'
276                else null
277        end,
278        LABEL, NOTE, ACTIVE_FLAG
279from TMP_DATASET_RECORD
280where DATASET_NAME is not null
281group by 1
282;
283
284
285/* update the VA based on the label and/or the special values */
286update DATASET_RECORD
287set VALUE_ATTRIBUTE_NAME = 'ERROR'
288where (VALUE_ATTRIBUTE_NAME is null)
289  and (MEASURE_VALUE = -9)
290;
291
292
293select * from DATASET_RECORD
294where VALUE_ATTRIBUTE_NAME = 'ERROR'
295;
296
297
298
299
300/* Remove redundant series or cat dim values that match/are period dim values.
301if dimension 1 is the same as period then put dim 2 into dim 1
302update DATASET_RECORD set DIMENSION_1_VALUE = '!~~~!'           where ((DIMENSION_1_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_1_VALUE != PERIOD_DIMENSION_VALUE));
303update DATASET_RECORD set DIMENSION_2_VALUE = '!~~~!' where ((DIMENSION_2_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_2_VALUE != PERIOD_DIMENSION_VALUE));
304*/
305select * from DATASET_RECORD
306where ((DIMENSION_1_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_1_VALUE != PERIOD_DIMENSION_VALUE))
307   or ((DIMENSION_2_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_2_VALUE != PERIOD_DIMENSION_VALUE))
308;
309select * from DATASET_RECORD
310where ((DIMENSION_1_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_1_VALUE = PERIOD_DIMENSION_VALUE))
311   or ((DIMENSION_2_NAME = PERIOD_DIMENSION_NAME) and (DIMENSION_2_VALUE = PERIOD_DIMENSION_VALUE))
312;
313
314
315/* did have it matching on the VALUE as well but there's no value here.  If the
316        dimension name matches the period then get rid of it - period should alreay
317        be correct from previous updates.
318update DATASET_RECORD set DIMENSION_1_VALUE = DIMENSION_2_VALUE where (DIMENSION_1_NAME = PERIOD_DIMENSION_NAME);
319update DATASET_RECORD set DIMENSION_1_NAME  = DIMENSION_2_NAME  where (DIMENSION_1_NAME = PERIOD_DIMENSION_NAME);
320
321update DATASET_RECORD set DIMENSION_2_VALUE = ''     where DIMENSION_1_NAME = DIMENSION_2_NAME;
322update DATASET_RECORD set DIMENSION_2_NAME  = 'None' where DIMENSION_1_NAME = DIMENSION_2_NAME;
323
324update DATASET_RECORD set DIMENSION_2_VALUE = ''     where (DIMENSION_2_NAME = PERIOD_DIMENSION_NAME);
325update DATASET_RECORD set DIMENSION_2_NAME  = 'None' where (DIMENSION_2_NAME = PERIOD_DIMENSION_NAME);
326*/
327
328
329/* Seeing issues with duplicate records.  Show Lois so she can determine the
330        datasource issue.  This query takes a while to run...
331
332select *
333from TMP_DATASET_RECORD t1
334where exists
335        (select 1
336         from TMP_DATASET_RECORD t2
337     where  t1.INDICATOR_NAME = t2.INDICATOR_NAME
338                and t1.DATASET_NAME != t2.DATASET_NAME
339                and t1.MEASURE_NAME = t2.MEASURE_NAME
340                and t1.PERIOD_DIMENSION_NAME = t2.PERIOD_DIMENSION_NAME
341        and t1.PERIOD_DIMENSION_VALUE = t2.PERIOD_DIMENSION_VALUE
342        and t1.DIMENSION_1_NAME = t2.DIMENSION_1_NAME
343        and t1.DIMENSION_1_VALUE = t2.DIMENSION_1_VALUE
344        and ((t1.DIMENSION_2_NAME = t2.DIMENSION_2_NAME)
345                        or
346                        (t1.DIMENSION_2_NAME is null and t2.DIMENSION_2_NAME is null))
347        and ((t1.DIMENSION_2_VALUE = t2.DIMENSION_2_VALUE)
348                        or
349                        (t1.DIMENSION_2_VALUE is null and t2.DIMENSION_2_VALUE is null))
350                and t1.VALUE = t2.VALUE
351         limit 1, 1
352        )
353order by t1.DATASET_NAME
354;
355*/
356
357
358
359/* Shouldn't see any of these */
360select * 
361from DATASET_RECORD t1
362where exists 
363        (select 1
364         from DATASET_RECORD t2
365     where  t1.DATASET_NAME = t2.DATASET_NAME
366                and t1.PERIOD_DIMENSION_NAME = t2.PERIOD_DIMENSION_NAME
367        and t1.PERIOD_DIMENSION_VALUE = t2.PERIOD_DIMENSION_VALUE
368        and t1.DIMENSION_1_NAME = t2.DIMENSION_1_NAME
369        and t1.DIMENSION_1_VALUE = t2.DIMENSION_1_VALUE
370        and ((t1.DIMENSION_2_NAME = t2.DIMENSION_2_NAME) 
371                        or 
372                        (t1.DIMENSION_2_NAME is null and t2.DIMENSION_2_NAME is null))
373        and ((t1.DIMENSION_2_VALUE = t2.DIMENSION_2_VALUE) 
374                        or 
375                        (t1.DIMENSION_2_VALUE is null and t2.DIMENSION_2_VALUE is null))
376                and t1.MEASURE_VALUE = t2.MEASURE_VALUE
377         limit 1, 1
378        )
379order by t1.DATASET_NAME
380;
381
382
383/* These are likely dups or at least dups due to data source ip.mn.dsn */
384select * 
385from DATASET_RECORD t1
386where exists 
387        (select 1
388         from DATASET_RECORD t2
389     where  t1.DATASET_NAME = t2.DATASET_NAME
390                and t1.PERIOD_DIMENSION_NAME = t2.PERIOD_DIMENSION_NAME
391        and t1.PERIOD_DIMENSION_VALUE = t2.PERIOD_DIMENSION_VALUE
392        and t1.DIMENSION_1_NAME = t2.DIMENSION_1_NAME
393        and t1.DIMENSION_1_VALUE = t2.DIMENSION_1_VALUE
394        and ((t1.DIMENSION_2_NAME = t2.DIMENSION_2_NAME) 
395                        or 
396                        (t1.DIMENSION_2_NAME is null and t2.DIMENSION_2_NAME is null))
397        and ((t1.DIMENSION_2_VALUE = t2.DIMENSION_2_VALUE) 
398                        or 
399                        (t1.DIMENSION_2_VALUE is null and t2.DIMENSION_2_VALUE is null))
400                and t1.MEASURE_VALUE = t2.MEASURE_VALUE
401         limit 1, 1
402        )
403order by t1.DATASET_NAME
404;
405
406
407/*
408stuff added 7/27/2019 to clean out usages
409now that we have dataset records, clean out invalid dim usages
410*/
411
412/* Delete DSRs that do NOT have a measure value and blank/null label.  There's
413        a bunch of blank measure values that have a blank label and have a "suppresed"
414        or No Data value attribute set.
415*/
416select concat('*', LABEL, '*'), VALUE_ATTRIBUTE_NAME, count(*) c
417from DATASET_RECORD
418where MEASURE_VALUE is null
419  and ((LABEL is null) or (LABEL <= " "))
420  and ((VALUE_ATTRIBUTE_NAME is null) or (VALUE_ATTRIBUTE_NAME < " "))
421group by LABEL, VALUE_ATTRIBUTE_NAME
422;
423
424delete from DATASET_RECORD
425where MEASURE_VALUE is null
426  and ((LABEL is null) or (LABEL <= " "))
427  and ((VALUE_ATTRIBUTE_NAME is null) or (VALUE_ATTRIBUTE_NAME < " "))
428;
429
430delete from DATASET_RECORD
431where MEASURE_VALUE is null
432  and LABEL = '**'
433  and VALUE_ATTRIBUTE_NAME is null
434;
435
436
437
438
439/* see all DTD records that do not have any DS records. */
440select * from DIMENSION_TO_DATASET dtd
441where dtd.DIMENSION_USAGE = 'dimension1'
442  and not exists 
443        (select 1 from DATASET_RECORD dr
444         where dr.DATASET_NAME = dtd.DATASET_NAME
445           and dr.DIMENSION_1_NAME = dtd.DIMENSION_NAME
446        )
447  and DIMENSION_NAME like 'Geo%'
448order by  dtd.DATASET_NAME,  dtd.DIMENSION_NAME
449;
450
451delete from DIMENSION_TO_DATASET
452where DIMENSION_USAGE = 'dimension1'
453  and DIMENSION_NAME like 'Geo%'
454  and not exists 
455        (select 1 from DATASET_RECORD dr
456         where dr.DATASET_NAME = DIMENSION_TO_DATASET.DATASET_NAME
457           and dr.DIMENSION_1_NAME = DIMENSION_TO_DATASET.DIMENSION_NAME
458        )
459;
460
461
462select * from DIMENSION_TO_DATASET dtd
463where dtd.DIMENSION_USAGE = 'dimension2'
464  and not exists 
465        (select 1 from DATASET_RECORD dr
466         where dr.DATASET_NAME = dtd.DATASET_NAME
467           and dr.DIMENSION_2_NAME = dtd.DIMENSION_NAME
468        )
469order by  dtd.DATASET_NAME,  dtd.DIMENSION_NAME
470;
471
472delete from DIMENSION_TO_DATASET
473where DIMENSION_USAGE = 'dimension2'
474  and DIMENSION_NAME like 'Geo%'
475  and not exists 
476        (select 1 from DATASET_RECORD dr
477         where dr.DATASET_NAME = DIMENSION_TO_DATASET.DATASET_NAME
478           and dr.DIMENSION_2_NAME = DIMENSION_TO_DATASET.DIMENSION_NAME
479        )
480;
481
482
483/*
484Still have conflicting dim usage where 2 datasets have same dim homicide example:
485both have legit State Measure Values - just different period dimension!!!
486has both state and country values by year.
487
488select * from
489dataset_record
490where dataset_name = 'Homicide.RateAADths100K.CDC_WONDER'   
491
492select * from
493dataset_record
494where dataset_name = 'Homicide.RateAADths100K.BVRHS_Death'
495*/ 
496
497
498
499/* Some Dataset Records have dim1 & 2 as a geo.  Script should be checked to see
500        if this can be correct.  Most are State and Country.  Other state and country
501        records worked so this  might have to be done manually.  For now put a prefix
502        on the records so that they are easily seen.
503*/
504select * from DATASET_RECORD
505where  DIMENSION_1_NAME like 'Geo%'
506  and DIMENSION_2_NAME like 'Geo%'
507;
508
509
510
511
512/* Fix dual usage where the script puts the dual used dimension as dim1.  When
513        dual usage Sex or RaceEthDOH etc is used as both a dim1 (when doing straight
514        chart, or as dim2 when doing a 2d chart like GeoCnty by Sex or Race). 
515
516        Since this is only seen on dim 1 (because the script populates it that way)
517        set the usage to dim 1 or 2 for those that have a dual use. 
518        There's about 45 of these records that need updating.  These associate with
519        thousands of actual dataset records...
520
521        For testing added this to the where:
522          and DATASET_NAME = 'AlcoholDrinkDriveAdult.PctDrinkDrive.BRFSS'
523
524        Can do a before update and after to see the dimension1or2 value change.
525        select * from DIMENSION_TO_DATASET;
526        where DATASET_NAME = 'AlcoholDrinkDriveAdult.PctDrinkDrive.BRFSS'
527*/
528select * from DIMENSION_TO_DATASET dtd
529where DIMENSION_USAGE = 'dimension1'
530  and DIMENSION_NAME in   
531        (select dr1.DIMENSION_1_NAME
532         from DATASET_RECORD dr1
533         where dr1.DATASET_NAME = dtd.DATASET_NAME
534           and dr1.DIMENSION_1_NAME in (select DIMENSION_2_NAME from DATASET_RECORD dr2 where dr2.DATASET_NAME = dr1.DATASET_NAME)
535        )
536;
537
538update DIMENSION_TO_DATASET dtd
539set DIMENSION_USAGE = 'dimension1or2'
540where DIMENSION_USAGE = 'dimension1'
541  and DIMENSION_NAME in   
542        (select dr1.DIMENSION_1_NAME
543         from DATASET_RECORD dr1
544         where dr1.DATASET_NAME = dtd.DATASET_NAME
545           and dr1.DIMENSION_1_NAME in (select DIMENSION_2_NAME from DATASET_RECORD dr2 where dr2.DATASET_NAME = dr1.DATASET_NAME)
546        )
547;
548
549
550/* Here's some tests that show the dataset records that are associated with this.
551        First do the dim1's that have a matching dim2 (usually the most used).
552
553        Since these are the DSR, the view and DSRs are already setup correctly.
554        These queries are simply used to show the dual use.
555*/
556select count(*)
557from DATASET_RECORD dr1
558where dr1.DIMENSION_2_NAME in (select DIMENSION_1_NAME from DATASET_RECORD dr2 where dr2.DATASET_NAME = dr1.DATASET_NAME)
559order by dr1.DATASET_NAME, dr1.DIMENSION_1_NAME, dr1.DIMENSION_1_VALUE, dr1.DIMENSION_2_NAME, dr1.DIMENSION_2_VALUE
560;
561
562select count(*)
563from DATASET_RECORD dr1
564where dr1.DIMENSION_1_NAME in (select DIMENSION_2_NAME from DATASET_RECORD dr2 where dr2.DATASET_NAME = dr1.DATASET_NAME)
565order by dr1.DATASET_NAME, dr1.DIMENSION_1_NAME, dr1.DIMENSION_1_VALUE, dr1.DIMENSION_2_NAME, dr1.DIMENSION_2_VALUE
566;
567
568
569
570/* DIMENSION_1_OR_2 issues to be aware of.  Of the dual usages, some have
571        multiples.  For NM; of the mults Homicide had some series switched/out of
572        sync.  Did not dig into it but could be missing values and the way the
573        script loops through and populates the dims for these records?
574       
575        In anycase something to be aware of watch for.
576*/
577select count(*) from DIMENSION_TO_DATASET
578where DIMENSION_USAGE = 'dimension1or2';
579
580select * from DIMENSION_TO_DATASET
581where DIMENSION_USAGE = 'dimension1or2';
582/*
583  and DATASET_NAME in
584        (select DATASET_NAME
585         from DIMENSION_TO_DATASET
586     where DIMENSION_USAGE = 'dimension1or2'
587         group by DATASET_NAME
588         having count(*) > 1
589        )
590;
591*/
592
593
594/* even indexed this takes forever to run
595select * from INDICATOR_VIEW_VALUE ivv
596where not exists
597        (
598        select 1
599        from TMP_DATASET_RECORD tdr
600        where tdr.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
601          and tdr.DIMENSION_1_NAME  = ivv.DIMENSION_1_NAME
602          and tdr.DIMENSION_1_VALUE = ivv.DIMENSION_1_VALUE
603          and tdr.DIMENSION_2_NAME  = ivv.DIMENSION_2_NAME
604          and tdr.DIMENSION_2_VALUE = ivv.DIMENSION_2_VALUE
605          and tdr.PERIOD_DIMENSION_NAME  = ivv.PERIOD_DIMENSION_NAME
606          and tdr.PERIOD_DIMENSION_VALUE = ivv.PERIOD_DIMENSION_VALUE
607        )
608order by INDICATOR_VIEW_NAME
609limit 100
610;
611*/
612
613
614/* this does the same thing but MySQL doesn't support minus
615select INDICATOR_VIEW_NAME,
616        DIMENSION_1_NAME, DIMENSION_1_VALUE,
617        DIMENSION_2_NAME, DIMENSION_2_VALUE,
618        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE
619from INDICATOR_VIEW_VALUE
620minus
621select ivv.INDICATOR_VIEW_NAME,
622        ivv.DIMENSION_1_NAME, ivv.DIMENSION_1_VALUE,
623        ivv.DIMENSION_2_NAME, ivv.DIMENSION_2_VALUE,
624        ivv.PERIOD_DIMENSION_NAME, ivv.PERIOD_DIMENSION_VALUE
625from INDICATOR_VIEW_VALUE ivv, TMP_DATASET_RECORD tdr
626where tdr.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
627  and tdr.DIMENSION_1_NAME  = ivv.DIMENSION_1_NAME
628  and tdr.DIMENSION_1_VALUE = ivv.DIMENSION_1_VALUE
629  and tdr.DIMENSION_2_NAME  = ivv.DIMENSION_2_NAME
630  and tdr.DIMENSION_2_VALUE = ivv.DIMENSION_2_VALUE
631  and tdr.PERIOD_DIMENSION_NAME  = ivv.PERIOD_DIMENSION_NAME
632  and tdr.PERIOD_DIMENSION_VALUE = ivv.PERIOD_DIMENSION_VALUE
633order by INDICATOR_VIEW_NAME
634;
635*/
636select count(ivv.INDICATOR_VIEW_NAME)
637from INDICATOR_VIEW_VALUE ivv
638left join TMP_DATASET_RECORD tdr on
639  tdr.INDICATOR_VIEW_NAME = ivv.INDICATOR_VIEW_NAME
640  and tdr.DIMENSION_1_NAME  = ivv.DIMENSION_1_NAME
641  and tdr.DIMENSION_1_VALUE = ivv.DIMENSION_1_VALUE
642  and tdr.DIMENSION_2_NAME  = ivv.DIMENSION_2_NAME
643  and tdr.DIMENSION_2_VALUE = ivv.DIMENSION_2_VALUE
644  and tdr.PERIOD_DIMENSION_NAME  = ivv.PERIOD_DIMENSION_NAME
645  and tdr.PERIOD_DIMENSION_VALUE = ivv.PERIOD_DIMENSION_VALUE
646;
647
648/* Possible issue of dup keys for different IPVs with different measure values. */
649select * 
650from TMP_DATASET_RECORD
651group by INDICATOR_NAME, MEASURE_NAME,
652        DIMENSION_1_NAME, DIMENSION_1_VALUE,
653        DIMENSION_2_NAME, DIMENSION_2_VALUE,
654        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE
655having count(MEASURE_NAME) > 1
656  and count(DIMENSION_1_NAME) > 1
657  and count(DIMENSION_1_VALUE) > 1
658  and count(DIMENSION_2_NAME) > 1
659  and count(DIMENSION_2_VALUE) > 1
660  and count(PERIOD_DIMENSION_NAME) > 1
661  and count(PERIOD_DIMENSION_VALUE) > 1
662;
663
664
665
666
667/* test any datasets that don't have any records? */
668select * from DATASET
669where NAME not in (select DATASET_NAME from DATASET_RECORD)
670;
671
672/* test IPs that do NOT have a dataset */
673select NAME from INDICATOR
674where NAME not in (select INDICATOR_NAME from DATASET)
675;
676
677/* show DSR record count by dataset */
678select DATASET_NAME, count(DATASET_NAME) 
679from DATASET_RECORD
680group by DATASET_NAME
681;
682
683/* show IVVs count by IV */
684select INDICATOR_VIEW_NAME, count(INDICATOR_VIEW_NAME) 
685from INDICATOR_VIEW_VALUE
686group by INDICATOR_VIEW_NAME
687;
688
689
690/* spot check */
691select INDICATOR_NAME, count(INDICATOR_VIEW_NAME) count_ivn,
692        MEASURE_NAME,
693        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE,
694        DIMENSION_1_NAME, DIMENSION_1_VALUE,
695        DIMENSION_2_NAME, DIMENSION_2_VALUE
696from TMP_DATASET_RECORD
697where INDICATOR_NAME like 'LBW%'
698group by INDICATOR_NAME, MEASURE_NAME,
699        DIMENSION_1_NAME, DIMENSION_1_VALUE,
700        DIMENSION_2_NAME, DIMENSION_2_VALUE,
701        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE
702having count(INDICATOR_VIEW_NAME) > 1
703;
704
705select INDICATOR_VIEW_NAME, 
706        MEASURE_NAME,
707        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE,
708        DIMENSION_1_NAME, DIMENSION_1_VALUE,
709        DIMENSION_2_NAME, DIMENSION_2_VALUE
710from TMP_DATASET_RECORD
711where INDICATOR_NAME like 'LBW%'
712  and PERIOD_DIMENSION_NAME = 'Year'
713  and PERIOD_DIMENSION_VALUE = 2019
714  and DIMENSION_1_NAME = 'Country'
715  and DIMENSION_1_VALUE = 'US'
716  and MEASURE_NAME = 'PctBirthLow'
717;
718
719select INDICATOR_NAME, count(INDICATOR_VIEW_NAME) count_ivn,
720        MEASURE_NAME,
721        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE,
722        DIMENSION_1_NAME, DIMENSION_1_VALUE,
723        DIMENSION_2_NAME, DIMENSION_2_VALUE
724from TMP_DATASET_RECORD
725group by INDICATOR_NAME, MEASURE_NAME,
726        DIMENSION_1_NAME, DIMENSION_1_VALUE,
727        DIMENSION_2_NAME, DIMENSION_2_VALUE,
728        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE
729having count(INDICATOR_VIEW_NAME) > 1
730limit 5000
731;
732
733/* Show temp DSRs that have multiple view names */ 
734select distinct(INDICATOR_NAME) 
735from 
736        (
737                select INDICATOR_NAME from TMP_DATASET_RECORD
738                group by INDICATOR_NAME, MEASURE_NAME,
739                        DIMENSION_1_NAME, DIMENSION_1_VALUE,
740                        DIMENSION_2_NAME, DIMENSION_2_VALUE,
741                        PERIOD_DIMENSION_NAME, PERIOD_DIMENSION_VALUE
742                having count(INDICATOR_VIEW_NAME) > 1
743        ) as a
744;
745
746/*------------------------------ END OF FILE ------------------------------*/
Note: See TracBrowser for help on using the repository browser.