Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
<?php
/* (C) 2010 - Antonio Ognio <antonio@ognio.com> */

/* This script scrapes concert setlist data for Dream Theater in 2009 from setlist.fm */

/*
MySQL schema:

CREATE TABLE setlist (
    id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
    concert_date DATE NOT NULL,
    venue VARCHAR(64) NOT NULL,
    position TINYINT UNSIGNED NOT NULL,
    song VARCHAR(64) NOT NULL,
    PRIMARY KEY(id),
    UNIQUE INDEX date_position_uidx (concert_date, position)
);
*/

/**
 * Collect the URLs of the individual 2009 setlist pages from the paginated
 * Dream Theater index on setlist.fm.
 *
 * @param int $pages Number of index pages to walk, starting at page 1.
 * @return array List of unique absolute setlist URLs, in order of discovery.
 */
function get_setlists_2009($pages = 8)
{
    $baseurl = "http://www.setlist.fm/setlists/dream-theater-bd6a102.html";
    // The capture group ends before the last ';' inside the href, so whatever
    // follows it is dropped (presumably a session-id suffix -- confirm against
    // the live markup).
    $regexp = '|href="(.*setlist/dream-theater/2009/.*);.*"|';
    $urls = array();

    for ($page = 1; $page <= $pages; $page++) {
        $contents = file("$baseurl?page=$page");
        if ($contents === false) {
            // file() has already raised a warning; move on to the next page
            // instead of iterating over `false`.
            continue;
        }

        foreach ($contents as $line) {
            if (!preg_match($regexp, rtrim($line), $matches)) {
                continue;
            }
            // Links on the index page are relative ("../setlist/...").
            $u = str_replace('../', 'http://www.setlist.fm/', $matches[1]);
            if (!in_array($u, $urls, true)) {
                $urls[] = $u;
            }
        }
    }

    return $urls;
}

/**
 * Download one setlist page and extract the concert date, venue and songs.
 *
 * @param string $url Absolute URL of a setlist page.
 * @return array Map with keys 'date' (Y-m-d string, or null when the page
 *               header could not be parsed), 'venue' (string or null) and
 *               'songs' (map of 1-based position => song title).
 */
function grab_setlist($url)
{
    $regexp = '|<span class="">(.+)</span>|';
    $date = null;
    $venue = null;
    $songs = array();
    $in_list = false;

    $contents = file($url);
    if ($contents === false) {
        // Fetch failed (file() has already raised a warning): report an
        // empty setlist rather than iterating over `false`.
        return array('date' => null, 'venue' => null, 'songs' => array());
    }

    foreach ($contents as $line) {
        /* extract date and venue from the page heading */
        if (is_null($date) && preg_match(
            '|Dream Theater Concert at (.+) Setlist on (.+, 2009)</h1>|',
            $line,
            $matches
        )) {
            $timestamp = strtotime($matches[2]);
            // strtotime() returns false on unparseable input; date() would
            // then silently produce 1970-01-01.
            if ($timestamp !== false) {
                $venue = html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
                $date = date("Y-m-d", $timestamp);
            }
        }

        /* extract songs: only lines between <ol> and </ol> are considered */
        if (preg_match('|<ol>|', $line)) {
            $in_list = true;
        }
        if ($in_list && preg_match($regexp, $line, $matches)) {
            // Strip markup first, then decode entities, so that an encoded
            // "&lt;" is not mistaken for a tag.
            $title = html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
            // Positions are 1-based to match the `position` column.
            $songs[count($songs) + 1] = $title;
        }
        if (preg_match('|</ol>|', $line)) {
            $in_list = false;
        }
    }

    return array(
        'date' => $date,
        'venue' => $venue,
        'songs' => $songs
    );
}

/**
 * Escape a value for use inside a single-quoted MySQL string literal.
 *
 * Backslashes and single quotes are doubled, which is valid under MySQL's
 * default SQL mode (i.e. without NO_BACKSLASH_ESCAPES).
 *
 * @param string $value Raw text.
 * @return string Escaped text, without the surrounding quotes.
 */
function sql_escape_literal($value)
{
    return strtr((string) $value, array("\\" => "\\\\", "'" => "''"));
}

/**
 * Build one INSERT statement per song of a setlist.
 *
 * The auto-increment `id` column is left out of the column list so the
 * statements are also accepted in strict SQL mode.
 *
 * @param array  $setlist Map as returned by grab_setlist().
 * @param string $table   Target table name (inserted verbatim).
 * @return array List of SQL statements.
 */
function generate_inserts($setlist, $table = "setlist")
{
    $results = array();
    $date = sql_escape_literal($setlist['date']);
    $venue = sql_escape_literal($setlist['venue']);

    foreach ($setlist['songs'] as $position => $s) {
        $results[] = sprintf(
            "INSERT INTO $table (concert_date, venue, position, song) VALUES ('%s', '%s', %d, '%s');",
            $date,
            $venue,
            $position,
            sql_escape_literal($s)
        );
    }

    return $results;
}

$urls = get_setlists_2009();
foreach ($urls as $u) {
    $set = grab_setlist($u);
    if (is_null($set['date']) || is_null($set['venue'])) {
        // Both columns are NOT NULL; rows without them would be garbage.
        error_log("Skipping $u: could not extract concert date and venue");
        continue;
    }
    $inserts = generate_inserts($set);
    foreach ($inserts as $sql) {
        printf("%s\n", $sql);
    }
}
?>
This paste will be private.
From the Design Piracy series on my blog: