Tuesday, October 2, 2007

Great Race runner analysis methodology

Copied the results from the website, 100 runners at a time, and pasted them in an Excel spreadsheet. Saved the file in comma delimited format (.CSV). Then, used SQL Server 2000 to import the data.

Massaged and changed some of the data, as to replace NULL values in names, cities, states. Replaced single quotes with empty string.

update RunnerRaw set
Col004 = 'Unknown'
where Col004 IS NULL
go
update RunnerRaw set
Col005 = 'UK'
where Col005 IS NULL
go
update RunnerRaw set
Col008 = 'U'
where Col008 IS NULL
go
update RunnerRaw set
Col002 = (select replace(Col002,'''',''))

update RunnerRaw set
Col003 = (select replace(replace(Col003,'''',''),' ',''))

update RunnerRaw set
Col003 = (select replace(Col003,'Sr.',''))

update RunnerRaw set
Col003 = (select replace(Col003,'Jr.',''))

update RunnerRaw set
Col003 = (select replace(Col003,'St.','St. '))

update RunnerRaw set
Col004 = (select replace(replace(replace(Col004,'''',''),'.',''),',',''))

CREATE VIEW RunnerRawV
AS
select
CAST(Col001 as Integer) as 'Bib',
UPPER(Col002) as 'First',
UPPER(Col003) as 'Last',
UPPER(Col004) as 'City',
UPPER(Col005) as 'State',
UPPER(Col006) as 'Country',
CAST (Col007 as INTEGER) as 'Age',
UPPER(Col008) as 'Gender',
Col009 as 'ChipTime',
Col010 as 'ClockTime',
Cast(SubString (Col009, 1, 1) as Integer) * 3600 + CAST(SubString (Col009, 3, 2) as Integer) * 60 + CAST(SubString (Col009, 6, 2) as Integer) as 'TotalChipSeconds',
Cast(SubString (Col010, 1, 1) as Integer) * 3600 + CAST(SubString (Col010, 3, 2) as Integer) * 60 + CAST(SubString (Col010, 6, 2) as Integer) as 'TotalClockSeconds'
from RunnerRaw

CREATE Table Runner
(
Bib integer NOT NULL,
First varchar (20) NOT NULL,
Last varchar (30) NOT NULL,
City varchar (30) NOT NULL,
State char (2) NOT NULL,
Age Integer NOT NULL,
Gender char (1) NOT NULL,
ChipTime char (7) NOT NULL,
ClockTime char (7) not null,
TotalChipSeconds int not null,
TotalClockSeconds int not null
)

select 'INSERT INTO Runner (Bib, First, Last, City, State, Age, Gender, ChipTime, ClockTime, TotalChipSeconds, TotalClockSeconds) VALUES (' + CAST(Bib AS VARCHAR) + ',''' + LTRIM(RTRIM(First)) + ''',''' + LTRIM(RTRIM(Last)) + ''',''' + LTRIM(RTRIM(City)) + ''',''' + LTRIM(RTRIM(State)) + ''',' + CAST (Age AS VARCHAR) + ',''' + LTRIM(RTRIM(GENDER)) + ''',''' + LTRIM(RTRIM(ChipTime)) + ''',''' + LTRIM(RTRIM(ClockTime)) + ''',' + CAST (TotalChipSeconds AS VARCHAR) + ',' + CAST (TotalClockSeconds AS VARCHAR) + ')'
from RunnerRawV

select State, Replicate ('*', count(State) / 2)
from Runner
where State <> 'PA'
group by State
order by count(State) desc

select avg (totalclockseconds - totalchipseconds) from Runner

select count (First), First
from Runner
where Gender = 'F'
group by First
order by count(First) desc

select distinct First from Runner where Gender = 'F' and First in (select first from Runner where Gender = 'M')

select AVG (CAST (age as float)) from Runner where gender = 'F'

select totalclockseconds - totalchipseconds, *
from Runner
order by totalclockseconds - totalchipseconds desc

select TotalClockSeconds - TotalChipSeconds , * from Runner where TotalClockSeconds - TotalChipSeconds > 400
order by TotalClockSeconds - TotalChipSeconds desc

No comments: