#!/usr/bin/tclsh ## $Id$ ## $Author$ puts "Loading Dortmunder ... pint = 1" package require tdom package require http package require mysqltcl package require csv package require inifile proc setvar { var value } { upvar 1 $var v set v $value } proc setup_values { line } { set newline [list] foreach item [ string trim [ ::csv::split $line ","]] { regsub -all {"} $item "" item if { [string length $item] == 0 || $item == "NULL" } { lappend newline '' } else { if {[string is digit $item]} { lappend newline $item } else { regsub -all {'} $item "''" item lappend newline "'$item'" } } } return [::csv::join $newline ","] } proc get_dbh { inifile } { if {[catch {set inifp [ini::open $inifile ]} errmsg]} { puts stderr "Error opening inifile $inifile: $errmsg" exit 1 } set section "database"; array set db {}; foreach key [ini::keys $inifp $section] { set db($key) [ini::value $inifp $section $key] } return [mysqlconnect -host $db(host) -user $db(user) -password $db(password) -db $db(db)] return $db } proc mysql_result { db sql } { ## puts "DEBUG: $sql" if {[catch {mysqlexec $db $sql} errmsg]} { puts "Error inserting into games: $errmsg" puts $sql return } } proc get_fields { node fields } { set line [list] foreach field $fields { set value [$node getAttribute $field "NULL"] if { $value == "" } { set value "NULL" } if { $field == "dob" } { set info [split $value "/"] set value "[lindex $info 2]-[lindex $info 1]-[lindex $info 0]" } lappend line $value } return $line } puts "Dortmunder still loading, pint refilled .. pint = 1" proc get_players_data { db gameid dir} { set player_fields [list team id pos first_name last_name jersey_number height weight bats throws dob] set players_dir [file join $dir "batters"] if {![catch {glob [file join $players_dir "*.xml"]} files]} { foreach player $files { if {[file exists $player]} { set filename $player set xml [readfile $filename] if {![catch {set doc [dom parse $xml]}]} { set root [$doc documentElement] set name [$root nodeName] set line [list]; lappend line $gameid foreach value [get_fields $root $player_fields] { lappend line $value } mysql_result $db "insert into gameday_players values ([setup_values [::csv::join $line ","]]);" } } } } } proc get_gameday_game_info { db gameid lg boxscore games } { # we need some information from the boxscore set xml $boxscore set doc [dom parse $xml] set root [$doc documentElement] set name [$root nodeName] set boxscore [$root selectNode /boxscore] set game_id [$boxscore getAttribute game_id ""] set game_date [$boxscore getAttribute date ""] set home_team [$boxscore getAttribute home_team_code ""] set away_team [$boxscore getAttribute away_team_code ""] set away_id [$boxscore getAttribute away_id ""] set home_id [$boxscore getAttribute home_id ""] set game_date [join [lrange [split $game_id "/"] 0 2] "/"] # set type [string range $gameid 18 20] puts "Game_id: $gameid, Date: $game_date, $lg, $away_team ($away_id) @ $home_team ($home_id)" set linescore [$root selectNode /boxscore/linescore] set away_runs [$linescore getAttribute away_team_runs ""] set home_runs [$linescore getAttribute home_team_runs ""] set away_hits [$linescore getAttribute away_team_hits ""] set home_hits [$linescore getAttribute home_team_hits ""] set away_errors [$linescore getAttribute away_team_errors ""] set home_errors [$linescore getAttribute home_team_errors ""] # now we need the game information if { $games == "" } { set games "" } else { set xml $games } set game_fields [list type local_game_time gameday_sw] set team_fields [list type code abbrev id name w l league] set stadium_fields [list id name location] if {![catch {set doc [dom parse $xml]}]} { set root [$doc documentElement] set name [$root nodeName] set line $gameid set game_type [$root getAttribute type ""] set local_game_time [$root getAttribute local_game_time ""] set gameday_sw [$root getAttribute gameday_sw ""] set node [$root selectNode home] foreach node [$root selectNode *] { set name [$node nodeName] if { $name eq "team" } { set type [$node getAttribute type "NULL"] if { $type ne "NULL" } { set ${type}_team [$node getAttribute code "NULL"] set ${type}_id [$node getAttribute id "NULL"] set ${type}_name [$node getAttribute name "NULL"] set ${type}_w [$node getAttribute w "NULL"] set ${type}_l [$node getAttribute l "NULL"] set ${type}_lg [$node getAttribute league "NULL"] } } } if { $name eq "stadium" } { set stadium_id [$node getAttribute id "NULL"] set stadium_name [$node getAttribute name "NULL"] set stadium_loc [$node getAttribute location "NULL"] } } set values [list $gameid $game_type $lg $local_game_time $gameday_sw $home_team $home_id $home_name $home_w $home_l $home_lg $away_team $away_id $away_name $away_w $away_l $away_lg $home_runs $home_hits $home_errors $away_runs $away_hits $away_errors $stadium_id $stadium_name $stadium_loc] mysql_result $db "insert into gameday_game_info values ( [setup_values [::csv::join $values ","]]);" } proc get_boxscore_data { db xml gamedayid } { set doc [dom parse $xml] set root [$doc documentElement] set name [$root nodeName] set boxscore [$root selectNode /boxscore] set game_id [$boxscore getAttribute game_id "NULL"] set game_date [$boxscore getAttribute date "NULL"] set home_team [$boxscore getAttribute home_team_code "NULL"] set away_team [$boxscore getAttribute away_team_code "NULL"] set away_id [$boxscore getAttribute away_id "NULL"] set home_id [$boxscore getAttribute home_id "NULL"] set game_date [join [lrange [split $game_id "/"] 0 2] "/"] set type [string range $gamedayid 18 20] set linescore [$root selectNode /boxscore/linescore] set away_team_runs [$linescore getAttribute away_team_runs "NULL"] set home_team_runs [$linescore getAttribute home_team_runs "NULL"] set away_team_hits [$linescore getAttribute away_team_hits "NULL"] set home_team_hits [$linescore getAttribute home_team_hits "NULL"] set away_team_errors [$linescore getAttribute away_team_errors "NULL"] set home_team_errors [$linescore getAttribute home_team_errors "NULL"] foreach inning [$linescore selectNode inning_line_score] { set away_runs [$inning getAttribute away "-1"] set home_runs [$inning getAttribute home "-1"] set inning [$inning getAttribute inning] if {$away_runs eq "" } { set away_runs "-2" } if {$home_runs eq "" } { set home_runs "-2" } if {$away_runs eq "x"} { set away_runs "-1" } if {$home_runs eq "x"} { set home_runs "-1" } set values [::csv::join [list $gamedayid $inning $home_runs $away_runs] ","] mysql_result $db "insert into gameday_linescore values ( [setup_values $values]);" } set batting [$root selectNode /boxscore/batting] foreach batting_node $batting { set type [$batting_node getAttribute team_flag "NULL"] set batting_ids [$batting_node selectNode batter] foreach batter $batting_ids { set player_id [$batter getAttribute id "NULL"] set pos [$batter getAttribute pos "NULL"] set bo [$batter getAttribute bo "000" ] set h [$batter getAttribute h "NULL"] set po [$batter getAttribute po "NULL"] set hr [$batter getAttribute hr "NULL"] set bb [$batter getAttribute bb "NULL"] set so [$batter getAttribute so "NULL"] set rbi [$batter getAttribute rbi "NULL"] set ab [$batter getAttribute ab "NULL"] set r [$batter getAttribute r "NULL"] set t [$batter getAttribute t "NULL"] set d [$batter getAttribute d "NULL"] set lob [$batter getAttribute lob "NULL"] set a [$batter getAttribute a "NULL"] set e [$batter getAttribute e "0"] set sb [$batter getAttribute sb "0"] set cs [$batter getAttribute cs "0"] set pb [$batter getAttribute pb "0"] switch -- $type { away { set teamid $away_id } home { set teamid $home_id } default { puts "ERROR: away/home not defined in batting" } } set values [::csv::join [list $player_id $teamid $gamedayid $pos $h $hr $bb $so $rbi $ab $r $t $d $lob $sb $cs $bo] ","] mysql_result $db "insert into gameday_batting values ( [setup_values $values]);" set values [::csv::join [list $player_id $teamid $gamedayid $pos $po $a $e $pb] ","] mysql_result $db "insert into gameday_fielding values ( [setup_values $values]);" } } set pitching [$root selectNode /boxscore/pitching] foreach pitching_node $pitching { set type [$pitching_node getAttribute team_flag] set pitching_ids [$pitching_node selectNode pitcher] foreach pitcher $pitching_ids { set player_id [$pitcher getAttribute id "NULL"] set pos [$pitcher getAttribute pos "NULL"] set out [$pitcher getAttribute out "NULL"] set bf [$pitcher getAttribute bf "NULL"] set hr [$pitcher getAttribute hr "NULL"] set bb [$pitcher getAttribute bb "NULL"] set so [$pitcher getAttribute so "NULL"] set er [$pitcher getAttribute er "NULL"] set r [$pitcher getAttribute r "NULL"] set h [$pitcher getAttribute h "NULL"] set note [$pitcher getAttribute note "NULL"] set wins 0; set losses 0; set saves 0 set holds 0; set hopps 0; set sopps 0 if { $note ne "none" } { set s [split $note ')'] foreach part $s { if [regexp {^\((.*?),} $part -> a] { switch -exact -- $a { L { set losses 1 } W { set wins 1 } H { set hopps 1; set holds 1 } S { set saves 1; set sopps 1 } BH { set hopps 1 } BS { set sopps 1 } default { puts "ERROR, pitcher: value: $a note: $note" } } } } } switch -- $type { away { set teamid $away_id } home { set teamid $home_id } default { puts "ERROR: away/home not defined in batting" } } mysql_result $db "insert into gameday_pitching values ( [setup_values [::csv::join [list $player_id $teamid $gamedayid $pos $out $bf $hr $bb $so $er $r $h $wins $losses $holds $hopps $saves $sopps] ","]]);" } } } proc get_player_data { db xml url gamedayid } { set doc [dom parse $xml] set root [$doc documentElement] set name [$root nodeName] set umpires [$root selectNode umpires] if { $umpires != "" } { set ump_fields [list name position] foreach umpire [$umpires selectNode umpire] { set line [list $gamedayid] foreach value [get_fields $umpire $ump_fields] { lappend line $value } mysql_result $db "insert into gameday_umps values ( [setup_values [::csv::join $line ","]] );" } } set game [$root selectNode /game] foreach team [$game selectNode team] { set teamid [string tolower [$team getAttribute id]] foreach player [$team selectNode player] { set id [$player getAttribute id "NULL"] set first [$player getAttribute first "NULL"] set last [$player getAttribute last "NULL"] set num [$player getAttribute num "999"] set boxname [$player getAttribute boxname "NULL"] set b [$player getAttribute rl "X"] if {$num eq ""} { set num 999 } if {[string range $num 0 0] == "-"} { set num 999 } set values [::csv::join [list $gamedayid $teamid $id $first $last $num $boxname $b] ","] mysql_result $db "insert into gameday_rosters values ( [setup_values $values] );" } set coach_fields [list id first last num position] foreach coach [$team selectNode coach] { set line [list $gamedayid $teamid] foreach value [get_fields $coach $coach_fields] { lappend line $value } mysql_result $db "insert into gameday_coaches values ( [setup_values [::csv::join $line ","]] );" } } } proc get_hitting_data { db xml gamedayid } { set hip_fields [list x y inning batter pitcher type des team] set doc [dom parse $xml] set root [$doc documentElement] set name [$root nodeName] set hitchart [$root selectNode /hitchart ] set this_name [$hitchart nodeName] set hips [$hitchart selectNode *] foreach hip $hips { set this_name [$hip nodeName] set fields [set ${this_name}_fields] set line $gamedayid foreach value [get_fields $hip $fields] { lappend line $value } mysql_result $db "insert into gameday_hit_locations values ( [setup_values [::csv::join $line ","]] );" } } proc getnodes { node level } { set t [$node nodeName] # get attrs for the current node foreach b [$node attributes *] { puts "$level:[$node nodeName]:$b:[$node getAttribute $b]" } # get each node set n [$node selectNode *] if {[llength $n]>0} { set level "$level:$t" foreach a $n { getnodes $a $level } } } proc Q { value } { if { $value != "NULL" } { if {[string is ascii $value]} { set value [QQ $value] } } return $value } proc P { line } { set new_line [list] foreach item $line { lappend new_line [Q $item] } return $new_line } proc QQ { s } { return "\'[string map { ' '' } $s]\'" } proc pbp_data { db dir gameid } { set atbat_fields [list num b batter des event o pitcher s score stand] set pitch_fields [list id ax ay az break_angle break_length break_x break_y break_z des end_speed on_1b on_2b on_3b pfx_x pfx_z px pz start_speed sz_bot sz_top type vx0 vy0 vz0 x x0 y y0 z0 pitch_type type_confidence] set runner_fields [list id earned end event rbi score start] set action_fields [list pitch b des event o player s score] set po_fields [list des] set inning_dir [file join $dir "inning"] if {![catch {glob [file join $inning_dir "*.xml"]} inning_files]} { foreach inning $inning_files { if {[file exists $inning]} { set filename $inning set xml [readfile $filename] if {![catch {set doc [dom parse $xml]}]} { set root [$doc documentElement] set name [$root nodeName] if { $name != "inning" } { continue } set num [$root getAttribute num] foreach thishalf [$root childNodes] { if {[$thishalf nodeName] == "top"} { set half 0 } else { set half 1 } set items [list action atbat] set nodes [$thishalf selectNode *] foreach node $nodes { set this_name [$node nodeName] set fields [ set ${this_name}_fields] set line [list $gameid $num $half] foreach value [get_fields $node $fields] { lappend line $value } mysql_result $db "INSERT INTO gameday_${this_name} VALUES ( [setup_values [::csv::join $line ","]] );" set atbat_num [$node getAttribute num -1] set pitch_num 0 foreach c [$node selectNode *] { set this_name [$c nodeName] if {$this_name == "pitch"} { incr pitch_num } set line [list $gameid $num $half $atbat_num $pitch_num] set fields [ set ${this_name}_fields] foreach value [get_fields $c $fields] { lappend line $value } mysql_result $db "INSERT INTO gameday_$this_name VALUES ( [setup_values [::csv::join $line ","]] );" } } } } catch {$doc delete} } } } } proc readfile { file } { if {[catch {open $file "r"} fp]} { # puts "Error reading $file, $fp" return -1 } set data [read $fp] close $fp return $data } proc go_get_it { lg daydir urlList inifile } { set db [get_dbh $inifile] foreach url $urlList { set gamedayid [file tail $url] set rows [mysqlsel $db "select * from gameday_game_info where gameid = \'$gamedayid\'" -list] if { $rows > 0 } { puts "Already have data for $gamedayid" continue } set file [file join $url boxscore.xml] set data [readfile $file] if {$data == "-1" } { puts "boxscore data not found for [file tail $url]" } else { set boxscore $data set file [file join $url game.xml] set data [readfile $file] if { $data == "-1" } { puts "game.xml data not found for [file tail $url]" set data "" } get_gameday_game_info $db $gamedayid $lg $boxscore $data get_boxscore_data $db $boxscore $gamedayid } set file [file join $url players.xml] set data [readfile $file] if {$data == "-1" } { puts "player data not found for [file tail $url]" } else { get_player_data $db $data $url $gamedayid } set file [file join $url inning inning_hit.xml] set data [readfile $file] if {$data == "-1" } { puts "inning hit data not found for [file tail $url]" } else { get_hitting_data $db $data $gamedayid } set file [file join $url inning inning_1.xml] if {$data == "-1" } { puts "No file for inning 1 on [file tail $url] ?! Skipping pbp data" } else { pbp_data $db $url $gamedayid } if {[catch {glob [file join $url batters]} errMsg]} { puts "No batters directory" } else { get_players_data $db $gamedayid $url } } } proc sleep { N } { after [expr {int($N*1000)}] } proc createdir { dir } { if {[file exists $dir]} { if {![file isdirectory $dir] } { puts "$dir already exists, but is not a directory!" exit 1 } } else { if {[catch {file mkdir $dir} errmsg]} { puts "Error creating $dir: $errmsg" exit 1 } } } proc writefile { url dir file } { createdir $dir set token [http::geturl $url] if [http_data_ok_p $token] { set filename [file join $dir $file] if {[file exists $filename]} { # puts "$file exists" return 0 } if {[catch {open $filename "w"} fp]} { puts "Error creating $dir: $fp" exit 1 } set data [http::data $token] puts $fp $data close $fp } catch {http::cleanup $token} } proc http_data_ok_p { token } { switch -glob [http::ncode $token] { 404 { set found_p 0 } default { set found_p 1 } } return $found_p } proc pullOutTheURLs {html} { # Parse your HTML document into a DOM tree structur set doc [dom parse -html $html] # root will be the root element of your HTML document, # ie. the HTML element set root [$doc documentElement] # The following finds all anchor links . It isn't clear to me, # if you also interested in the urls of , and # elements. set nodeList "" catch {set nodeList [$root selectNodes {descendant::a}]} if {$nodeList == "" } { return {} } # init the result list set urlList {} # Pull out the Values of the href attributes foreach node $nodeList { set attList [$node attributes *] foreach attribute $attList { if {[string tolower $attribute] == "href"} { set url [$node getAttribute $attribute] #if {[string range $url 0 3] == "gid_"} { # lappend urlList [$node getAttribute $attribute] # break #} #puts $url lappend urlList [$node getAttribute $attribute] } } } # Get rid of the DOM representation of your HTML document $doc delete # finished return $urlList } proc drill_down { base level url } { set token [http::geturl $url] if {[http_data_ok_p $token]} { set urls [pullOutTheURLs [http::data $token]] foreach u $urls { if { [string index $u 0] == "/" } { continue } if { [string index $u end] == "/"} { set l "$level/[regsub -all {\/} $u ""]" drill_down $base $l "${url}${u}" } else { writefile ${url}${u} [file join $base $level] $u # puts "[file tail [string toupper $level]]: $u" } # sleep 1 } } else { puts "Could not get $url" } catch {http::cleanup $token} } proc get_files { base_url base_dir year month day } { set dayurl "$base_url/$year/$month/$day/" set basedir [file join $base_dir $year $month $day] createdir $basedir set urlList {} set token [http::geturl $dayurl] if {[http_data_ok_p $token]} { catch {set urlList [pullOutTheURLs [http::data $token]]} catch {http::cleanup $token} foreach url $urlList { set gamedayid [regsub -all {\/} $url ""] # puts $url if {[string range $url 0 3] == "gid_"} { puts "Working on $gamedayid" # puts "${dayurl}${url}" drill_down $basedir $gamedayid "${dayurl}${url}" } } } else { puts "Problem with $dayurl" } } puts "Done loading dortmunder, pint = 0"