<?php
/* (C) 2010 - Antonio Ognio <antonio@ognio.com> */
/* This script scrapes concert setlist data for Dream Theater in 2009 from setlist.fm */
/* MySQL schema:
CREATE TABLE setlist (
id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
concert_date DATE NOT NULL,
venue VARCHAR(64) NOT NULL,
position TINYINT UNSIGNED NOT NULL,
song VARCHAR(64) NOT NULL,
PRIMARY KEY(id),
UNIQUE INDEX date_position_uidx (concert_date, position)
);
*/
/**
 * Collect the URLs of the 2009 Dream Theater setlist pages linked from setlist.fm.
 *
 * Fetches listing pages 1 through 8 of the band's setlist index and scans every
 * line for an href containing "setlist/dream-theater/2009/". A listing page that
 * cannot be fetched is skipped, so one failed request does not abort the crawl.
 *
 * @return array Unique setlist URLs in the order they were first seen; any
 *               '../' in a link is replaced with 'http://www.setlist.fm/'.
 */
function get_setlists_2009() {
    $baseurl = "http://www.setlist.fm/setlists/dream-theater-bd6a102.html";
    /* Group 1 is the link target; the ';' and whatever follows it inside the
       attribute (presumably a session suffix) is left out of the capture. */
    $regexp = '|href="(.*setlist/dream-theater/2009/.*);.*"|';
    $urls = array();
    for ($page = 1; $page <= 8; $page++) {
        $contents = file("$baseurl?page=$page");
        if ($contents === false) {
            /* file() has already raised its own warning; iterating over
               false would only add a second, misleading one. */
            continue;
        }
        foreach ($contents as $line) {
            $line = chop($line);
            if (!preg_match($regexp, $line, $matches)) {
                continue;
            }
            $u = str_replace('../', 'http://www.setlist.fm/', $matches[1]);
            /* Strict comparison: URLs are strings and must match exactly. */
            if (!in_array($u, $urls, true)) {
                $urls[] = $u;
            }
        }
    }
    return $urls;
}
/**
 * Read one setlist page and extract the concert date, venue and song list.
 *
 * The page is read line by line. The first line matching the
 * "Dream Theater Concert at <venue> Setlist on <date>, 2009</h1>" heading
 * supplies the venue and date. Song titles are taken from
 * <span class="">...</span> elements found between an <ol> line and the
 * following </ol> line.
 *
 * @param string $url Location of the setlist page (anything file() can open).
 *
 * @return array Map with keys:
 *               'date'  => 'Y-m-d' string, or null if no heading was found;
 *               'venue' => entity-decoded venue text, or null if no heading was found;
 *               'songs' => song titles keyed by 1-based position.
 *               If the page cannot be read, both scalars are null and 'songs' is empty.
 */
function grab_setlist($url) {
    $regexp = '|<span class="">(.+)</span>|';
    $date = null;
    $venue = null;
    $songs = array();

    $contents = file($url);
    if ($contents === false) {
        /* file() has already raised a warning; return an empty result
           instead of iterating over false. */
        return array(
            'date' => $date,
            'venue' => $venue,
            'songs' => $songs
        );
    }

    /* True while the current line lies inside the <ol> ... </ol> song list. */
    $in_list = false;
    foreach ($contents as $line) {
        /* extract date and venue from the page heading (first match only) */
        if (is_null($date) && preg_match(
            '|Dream Theater Concert at (.+) Setlist on (.+, 2009)</h1>|',
            $line, $matches)) {
            $venue = html_entity_decode($matches[1]);
            $date = date("Y-m-d", strtotime($matches[2]));
        }
        /* extract songs */
        if (preg_match('|<ol>|', $line)) {
            $in_list = true;
        }
        if ($in_list && preg_match($regexp, $line, $matches)) {
            /* Positions start at 1 so they can be stored directly as the
               setlist position. */
            $songs[count($songs) + 1] = strip_tags($matches[1]);
        }
        /* Checked last so a song on the same line as </ol> is still kept. */
        if (preg_match('|</ol>|', $line)) {
            $in_list = false;
        }
    }

    return array(
        'date' => $date,
        'venue' => $venue,
        'songs' => $songs
    );
}
/**
 * Build one SQL INSERT statement per song of a scraped setlist.
 *
 * Text values are escaped before being placed inside single-quoted SQL
 * literals (backslashes are doubled, single quotes are doubled), so titles
 * such as "Don't Look Past Me" yield valid statements instead of broken ones.
 *
 * @param array  $setlist Map with keys 'date', 'venue' and 'songs'
 *                        (songs keyed by position), as returned by grab_setlist().
 * @param string $table   Target table name; inserted verbatim, so it must be a
 *                        trusted identifier.
 *
 * @return array List of INSERT statements, one per song, in song order.
 */
function generate_inserts($setlist, $table="setlist") {
    /* Read the expected keys explicitly rather than extract()-ing the whole
       array, which would create a variable for every key it contains. */
    $date = isset($setlist['date']) ? $setlist['date'] : '';
    $venue = isset($setlist['venue']) ? $setlist['venue'] : '';
    $songs = (isset($setlist['songs']) && is_array($setlist['songs']))
        ? $setlist['songs']
        : array();

    /* Escapes for a single-quoted SQL string literal. */
    $escapes = array('\\' => '\\\\', "'" => "''");
    $date_sql = strtr((string) $date, $escapes);
    $venue_sql = strtr((string) $venue, $escapes);

    $results = array();
    foreach ($songs as $position => $s) {
        /* The table name is passed as an argument so a '%' in it cannot
           corrupt the format string.
           NOTE(review): the '' given for the id column presumably relies on a
           non-strict SQL mode to trigger AUTO_INCREMENT; confirm. */
        $results[] = sprintf(
            "INSERT INTO %s VALUES('', '%s', '%s', '%d','%s');",
            $table,
            $date_sql,
            $venue_sql,
            $position,
            strtr((string) $s, $escapes)
        );
    }
    return $results;
}
/* Driver: scrape every 2009 setlist page and print its INSERT statements,
   one statement per output line. */
foreach (get_setlists_2009() as $setlist_url) {
    $statements = generate_inserts(grab_setlist($setlist_url));
    foreach ($statements as $statement) {
        printf("%s\n", $statement);
    }
}
?>