====== geosync.pl ====== ===== Info ===== Small perl script that downloads the [[http://geonames.org]] geographical database and replicates it in a local database. It can be re-run periodically to stay in sync (eg via [[wp>cron]]). ==== Usage ==== ==== Examples ==== ===== Source ===== #!/usr/bin/perl # # geosync.pl: # small perl script to import/sync data from geoname.org # to a local postgres database. # # Copyright: (C) 2006, Robin Gareus # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # use DBI; # # config # $datafile="allCountries"; #$datafile="NL"; $cfg{dbname}="zkm2"; $cfg{dbuser}="zkm"; $cfg{dbpass}=""; $cfg{tablename}="latlon2"; $cfg{codeURL}="http://download.geonames.org/export/dump/admin1Codes.txt"; $cfg{dataURL}="http://download.geonames.org/export/dump/$datafile.zip"; # this one is a Tab-separated file # with information sourced from http://download.geonames.org/countries/ $countryfile="/usr/home/rgareus/geosync/cc2a.csv"; $tmppath="/tmp/geomaps"; # These two are retrieved from geonames.org $zipfile=$tmppath."/".$datafile.".zip"; $regionfile=$tmppath."/admin1Codes.txt"; # That's the one in the zip file. $filename=$tmppath."/".$datafile.".txt"; $download=1; $verbose=0; $cleanup=1; $forceupdate=0; foreach (@ARGV) { if ($_ eq "-n") { $cleanup=0;} elsif ($_ eq "-d") { $download=0;} elsif ($_ eq "-v") { $verbose=1;} elsif ($_ eq "-f") { $forceupdate=1;} else { print " usage: geosync.pl [-d] [-v] [-n] [-f]\n -d : skip downloading the datafile.\n -v : verbose output\n -n : no not remove tmp files.\n -f : force update of entries - ignore modification_date.\n"; exit; } } # # download the data file # mkdir $tmppath; if (!$verbose) { $curlopts="-s"; $unzipopts="-qq"; } if ($download) { unlink $filename,$zipfile,$regionfile,$tmppath.'readme.txt'; if ($verbose) { print "downloading geonames database\n"; } `curl $curlopts $cfg{codeURL} > $regionfile`; `curl $curlopts $cfg{dataURL} > $zipfile`; `unzip $unzipopts -o $zipfile -d $tmppath`; } # #main code # $dbh=DBI->connect( 'dbi:Pg:dbname='.$cfg{dbname}, $cfg{dbuser}, $cfg{dbpass}, {RaiseError => $RAISE_ERROR, AutoCommit => 1}) or die ("Could not connect to database."); %regions=read_regions($regionfile); %countries=read_countries($countryfile); open(FH, "<:utf8", $filename) or die("Could not open geomaps file."); if (0) { print "(re)creating database table.\n"; create_table(); create_function(); } if ($verbose) { # count lines in data file print "checking data file..\n"; $ecnt=$lcnt=0; $total = `wc -l $filename | awk '//{print \$1 ;}'`; chop($total); $acnt=$ucnt=$icnt=$ncnt=0; print "starting data sync..\n"; } while () { my @data = split('\t',$_); my $fclass=@data[6]; my $fcode=@data[7]; $lcnt++; if ($verbose && ($lcnt%100 == 0)) { $|=1; print "progress: $lcnt / $total (post-filter: $ecnt - New: $ncnt Upd: $ucnt up-to-date: $icnt alt-name: $acnt) \r"; $|=0; } ## REGEXP FILTER geonames entries - cites and countries; # next unless ($fclass == 'p' || $fclass =='P' || $fclass == 'a' || $fclass =='A'); # next unless ($fcode =~/PPL[^WQ]?$/ || $fcode =~/PCL[ID]?$/); # REGEXP FILTER geonames entries - only cities... next unless ($fclass == 'p' || $fclass =='P'); next unless ($fcode =~/PPL[^WQR]?$/); ## REGEXP FILTER geonames entries - only countries... # next unless ($fclass == 'a' || $fclass =='A'); # next unless ($fcode =~/PCL[ID]?$/); #last if ($lcnt++ > 10000); #$lcnt++; $ecnt++; my $rv = add_geodate (\@data); ## extended info if ($verbose) { if ($rv==1) { $ucnt++; } elsif ($rv==2) { $ncnt++; } elsif ($rv==0) { $icnt++; } } $acnt += add_geodate_alternatives(\@data); } if ($verbose) { print "\n"; } if ($cleanup) { if ($verbose) { print "cleaning up.\n"; } unlink $filename,$zipfile,$regionfile,$tmppath.'readme.txt'; rmdir $tmppath; } if ($verbose) { print "\n"; } exit; # # PRIVATE FUNCTIONS # sub add_geodate_alternatives() { (my $id, my $name, my $asciiname, my $alternatenames, my $latitude, my $longitude, my $fclass, my $fcode, my $country, my $cc2, my $admin1, my $population, my $elevation, my $gtopo30, my $timezone, my $modification_date) = @{$_[0]}; my @data = @{$_[0]}; my $counter=0; # FIXME : remove old all alt. names on update when there are none. # in case that *all* alt names are removed from entry. foreach( split(/,/ ,$alternatenames)) { @data[0]= sprintf("-1%02i".$id , $counter++); @data[1]= $_; add_geodate(\@data); } return ($counter); } ## insert or update data ## arg: one geodate entry - array-ref ## ## returns -1: on error (never!) ## returns 0: if entry was up-to-date ## returns 1: if entry was updated ## returns 2: if entry was newly created sub add_geodate() { (my $id, my $name, my $asciiname, my $alternatenames, my $latitude, my $longitude, my $fclass, my $fcode, my $country, my $cc2, my $admin1, my $population, my $elevation, my $gtopo30, my $timezone, my $modification_date) = @{$_[0]}; # print "+".$fcode." $name\n"; # print "+".$country.$admin1."- $name\n"; chop($modification_date); my $rv=-1; my $query=""; if (get_existing_id($id) == FALSE) { $rv=2; $query = "insert into ".$cfg{tablename}." " ."(name,lat,lon, country, country_id, region, modification_date,class,id)" ."values (?,?,?,?,?,?,?,?,?)"; } else { if (!$forceupdate && is_entry_up_to_date($id,$modification_date)) { return (0); } if ($id < 0 && (substr $id,0,4) == "-100") { # print "debug $id :: -1XX".(substr $id,4)."!\n"; # remove all entries "-1".<01-99>".substr($id,4) # for the case the number of _alt names has decreased: # this code can surely be impoved - eg. always delete next entry only. for (my $did=1;$did<100;$did++) { my $delid= sprintf("-1%02d%s",$did,(substr $id,4)); exec_sql_query("delete from ".$cfg{tablename}." where id=".$delid); #print "debug $id :: $delid!\n"; } } $rv=1; $query = "update ".$cfg{tablename}." set name=(?),lat=(?),lon=(?),country=(?),country_id=(?),region=(?),modification_date=(?),class=(?) where id=(?)"; } my $cid=$country; # $cc2 my $cnm=get_country_name($cid); my $sth=$dbh->prepare($query); $sth->execute($name, $latitude, $longitude, $cnm, $cid, get_region_name($country,$admin1), $modification_date, $fcode, $id); return ($rv); } ## re-create geo-class sort function ## see http://www.geonames.org/export/codes.html ## PPLC: capital ## PPLA: seat of a first-order administrative ## PPLG: seat of government of a political entity ## PPL : populated place ## PPLS : populated places ## PPLL : populated locality ## PPLX : section of populated place # prereq: `createlang plpgsql zkm2` # as user postgres # usage: # SELECT *,geoclass_sort(class) as sortcol FROM $tablename ORDER BY sortcol; # # TODO change order of ELSIF clauses to match database statistics. sub create_function{ exec_sql_query("drop FUNCTION geoclass_sort(VARCHAR);"); my $query = "CREATE FUNCTION geoclass_sort (VARCHAR) RETURNS integer AS '\n" ." DECLARE\n" ." geoclass ALIAS for \$1;\n" ." BEGIN\n" ." IF geoclass = ''PPLC'' THEN RETURN 1;\n" ." ELSIF geoclass = ''PPLA'' THEN RETURN 2;\n" ." ELSIF geoclass = ''PPLG'' THEN RETURN 3;\n" ." ELSIF geoclass = ''PPLS'' THEN RETURN 4;\n" ." ELSIF geoclass = ''PPL'' THEN RETURN 5; \n" ." ELSIF geoclass = ''PPLX'' THEN RETURN 10; \n" ." ELSIF geoclass = ''PPLL'' THEN RETURN 6; END IF;\n" ." RETURN 10;\n" ." END;'\n" ."LANGUAGE 'plpgsql';\n"; # print $query."\n"; exec_sql_query($query); } sub create_table{ exec_sql_query("drop table ".$cfg{tablename}.";"); my $query = "create table ".$cfg{tablename}." (" ."id integer, primary key(id)," ."name character varying(400)," #, key(name)," ."lat numeric(5,2)," ."lon numeric(5,2)," ."country character varying(255)," ."country_id character varying(2)," ."class character varying(8)," ."modification_date character varying(32)," ."region character varying(400))"; exec_sql_query($query); } ## returns the region name for a given admin code ## args: county-code, admin-code; sub get_region_name($,$) { my $reqcode=(shift).".".(shift); return $regions{$reqcode} } ## read admin1codes to hash and return the hash ## arg: filename sub read_regions($) { my $filename = shift; open(FH, "<:utf8", $filename) or die("Could not open geomaps Codes file."); my %regions; while () { chop(); my @data = split('\t',$_); $regions{@data[0]} = @data[1]; } return (%regions); } ## returns the country name for a given ISO-3166 code ## args: county-code sub get_country_name($) { my $code=(shift); return $countries{$code} } ## read admin1codes to hash and return the hash ## read admin1codes to hash and return the hash ## arg: filename sub read_countries($) { my $filename = shift; open(FH, "<:utf8", $filename) or die("Could not open geomaps Codes file."); my %countries; while () { chop(); my @data = split('\t',$_); $countries{@data[0]} = @data[4]; } return (%countries); } ## execute sql query ## arg: sql query sub exec_sql_query ($) { my $query = shift; my $sth=$dbh->prepare($query); $sth->execute(); return (0); } ## check if geo-ID already exists in city-table ## arg: geo-id ## returns integer id or -1 if not found. sub get_existing_id ($){ my $id=shift; my (%row,$queryrow,$sth); my $query="select * from ".$cfg{tablename}." where id=$id;"; $sth=$dbh->prepare($query); $sth->execute(); if (%row=%{$sth->fetchrow_hashref()}) { return($row{id}); } return (FALSE); } sub is_entry_up_to_date ($;$) { my $id=shift; my $date=shift; my (%row,$sth); # TODO : check if '!=' is better than '>=' (allow downgrades - but sync exactly) my $query="select * from ".$cfg{tablename}." where id=$id and modification_date >= '$date';"; $sth=$dbh->prepare($query); $sth->execute(); if (%row=%{$sth->fetchrow_hashref()}) { if($row{id} != 0 ) { return(1); } } return (0); } #// vim: sw=4 ts=4 sts=4: {{tag>FLOSS WWW}}