--Copyright (c) 2007 Seth Schroeder -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE --AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, --OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN --THE SOFTWARE. ------------------------------------------------------------------------ -- Everything lives in the dhg schema DROP SCHEMA dhg; CREATE SCHEMA dhg; ------------------------------------------------------------------------ -- width_bucket does not exist in mysql ... -- this has had cursory testing but should be enhanced before any serious use! DROP FUNCTION dhg.width_bucket; DELIMITER | CREATE FUNCTION dhg.width_bucket(val INTEGER, _floor INTEGER, _ceiling INTEGER, _buckets INTEGER) RETURNS INTEGER DETERMINISTIC NO SQL BEGIN DECLARE bucket INTEGER; DECLARE range INTEGER; DECLARE valHeight INTEGER; DECLARE gapWidth DOUBLE; CASE WHEN val < _floor THEN -- _floor is included SET bucket = 0; WHEN val >= _ceiling THEN -- _ceiling is excluded SET bucket = _buckets + 1; ELSE SET range = _ceiling - _floor; SET valHeight = val - _floor; SET gapWidth = range / _buckets; -- valHeight / gapWidth returns a zero based number, but bucket counts begin at 1. -- no partial buckets or rounding of buckets! SET bucket = 1 + FLOOR(valHeight / gapWidth); END CASE; RETURN bucket; END; | DELIMITER ; ------------------------------------------------------------------------ -- Numbers to be analyzed go here. -- -- It's a shame to copy the data. A view would use the existing data. -- But changing data would mean dropping the view, which is a major hassle for -- views which rely on dhg.source. -- DROP TABLE dhg.source; CREATE TABLE dhg.source ( val INTEGER NOT NULL ); ------------------------------------------------------------------------ -- The number of buckets in the histogram are stored here. -- -- Warning: remember to UPDATE this table, not INSERT. Multiple rows here -- will cause cross joins / lots of unwanted rows in the results. -- DROP TABLE dhg.bucket_count; CREATE TABLE dhg.bucket_count ( num_buckets INTEGER NOT NULL ); INSERT INTO dhg.bucket_count (num_buckets) VALUES (10); ------------------------------------------------------------------------ -- meta is a central place for summary data. -- meta_full is a more verbose version. -- DROP VIEW dhg.meta; CREATE VIEW dhg.meta AS SELECT MAX(val) AS _max , MIN(val) AS _min , num_buckets FROM dhg.source , dhg.bucket_count GROUP BY num_buckets ; DROP VIEW dhg.meta_full; CREATE VIEW dhg.meta_full AS SELECT MAX(val) AS _max , MIN(val) AS _min , num_buckets , ROUND(AVG(val)) AS _avg , ROUND(STDDEV(val)) AS _stddev , MAX(val) - MIN(val) + 1 AS _range , ROUND((MAX(val) - MIN(val) + 1) / num_buckets) AS bucket_size FROM dhg.source , dhg.bucket_count GROUP BY num_buckets ; ------------------------------------------------------------------------ -- buckets associates each source number with its bucket -- -- width_bucket has an exclusive ceiling, so add 1 to force the -- max val into the results. -- DROP VIEW dhg.buckets; CREATE VIEW dhg.buckets AS SELECT val , dhg.width_bucket(val, _min, _max + 1, num_buckets) AS bucket FROM dhg.source , dhg.meta ; ------------------------------------------------------------------------ -- full_buckets groups the numbers by buckets. -- DROP VIEW dhg.full_buckets; CREATE VIEW dhg.full_buckets AS SELECT bucket , COUNT(*) contents FROM dhg.buckets GROUP BY bucket ; ------------------------------------------------------------------------ -- empty_buckets are used to fill gaps in the source data. -- -- Like the rest of the SQL, it's a quick and dirty hack. -- DROP TABLE dhg.empty_buckets; CREATE TABLE dhg.empty_buckets ( bucket INTEGER NOT NULL UNIQUE, contents INTEGER NOT NULL ); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (1, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (2, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (3, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (4, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (5, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (6, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (7, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (8, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (9, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (10, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (11, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (12, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (13, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (14, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (15, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (16, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (17, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (18, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (19, 0); INSERT INTO dhg.empty_buckets (bucket, contents) VALUES (20, 0); ------------------------------------------------------------------------ -- results displays the histogram. results_full includes more details. -- -- the left outer join from empty_buckets is probably the foulest thing -- I've done in a while, but it's for a better user interface! -- UI code is not my specialty. -- DROP VIEW dhg.results; CREATE VIEW dhg.results AS SELECT eb.bucket , COALESCE(fb.contents, 0) AS contents FROM dhg.empty_buckets eb LEFT OUTER JOIN dhg.full_buckets fb ON (eb.bucket = fb.bucket) , dhg.meta WHERE eb.bucket <= num_buckets ORDER BY bucket ; DROP VIEW dhg.results_full; CREATE VIEW dhg.results_full AS SELECT eb.bucket , COALESCE(fb.contents, 0) AS contents , ROUND(_min + bucket_size * (eb.bucket - 1)) _floor , ROUND(_min + bucket_size * eb.bucket - 1) _ceiling FROM dhg.empty_buckets eb LEFT OUTER JOIN dhg.full_buckets fb ON (eb.bucket = fb.bucket) , dhg.meta_full WHERE eb.bucket <= num_buckets ORDER BY bucket ; ------------------------------------------------------------------------ -- ALERT: You read to the end and found the bonus content. Nice! -- csv reformats the results into csv format. -- Good for importing & charting with OpenOffice.org or docs.google.com. DROP VIEW dhg.csv; CREATE VIEW dhg.csv AS SELECT bucket || ',' || contents FROM dhg.results ;