====== geosync.pl ====== ===== Info ===== Small perl script that downloads the [[http://geonames.org]] geographical database and replicates it in a local database. It can be re-run periodically to stay in sync (eg via [[wp>cron]]). ==== Usage ==== ==== Examples ==== ===== Source =====

#!/usr/bin/perl
#
# geosync.pl:
# small perl script to import/sync data from geoname.org
# to a local postgres database.
#
# Copyright: (C) 2006, Robin Gareus 
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

use DBI;

#
# config
#
$datafile="allCountries";
#$datafile="NL";

$cfg{dbname}="zkm2";
$cfg{dbuser}="zkm";
$cfg{dbpass}="";
$cfg{tablename}="latlon2";

$cfg{codeURL}="http://download.geonames.org/export/dump/admin1Codes.txt";
$cfg{dataURL}="http://download.geonames.org/export/dump/$datafile.zip";

# this one is a Tab-separated file
# with information sourced from http://download.geonames.org/countries/
$countryfile="/usr/home/rgareus/geosync/cc2a.csv";

$tmppath="/tmp/geomaps";

# These two are retrieved from geonames.org
$zipfile=$tmppath."/".$datafile.".zip";
$regionfile=$tmppath."/admin1Codes.txt";

# That's the one in the zip file.
$filename=$tmppath."/".$datafile.".txt";

$download=1;
$verbose=0;
$cleanup=1;
$forceupdate=0;

foreach (@ARGV) {
    if ($_ eq "-n") { $cleanup=0;}
    elsif ($_ eq "-d") { $download=0;}
    elsif ($_ eq "-v") { $verbose=1;}
    elsif ($_ eq "-f") { $forceupdate=1;}
    else { print " usage:  geosync.pl [-d] [-v] [-n] [-f]\n -d : skip downloading the datafile.\n -v : verbose output\n -n : no not remove tmp files.\n -f : force update of entries - ignore modification_date.\n"; exit; }
}

#
# download the data file
#

mkdir $tmppath;

if (!$verbose) {
    $curlopts="-s";
    $unzipopts="-qq";
}

if ($download) {
    unlink $filename,$zipfile,$regionfile,$tmppath.'readme.txt';
    if ($verbose) {
        print "downloading geonames database\n";
    }
    `curl $curlopts $cfg{codeURL} > $regionfile`;
    `curl $curlopts $cfg{dataURL} > $zipfile`;
    `unzip $unzipopts -o $zipfile -d $tmppath`;
}


#
#main code
#
$dbh=DBI->connect(
     'dbi:Pg:dbname='.$cfg{dbname}, $cfg{dbuser}, $cfg{dbpass},
     {RaiseError => $RAISE_ERROR, AutoCommit => 1})
    or die ("Could not connect to database.");

%regions=read_regions($regionfile);
%countries=read_countries($countryfile);

open(FH, "<:utf8", $filename)
    or die("Could not open geomaps file.");

if (0) {
    print "(re)creating database table.\n";
    create_table();
    create_function();
}

if ($verbose) {
    # count lines in data file
    print "checking data file..\n";
    $ecnt=$lcnt=0; $total = `wc -l $filename | awk '//{print \$1 ;}'`; chop($total);
    $acnt=$ucnt=$icnt=$ncnt=0;
    print "starting data sync..\n";
}
while () {
    my @data = split('\t',$_);
    my $fclass=@data[6];
    my $fcode=@data[7];

    $lcnt++;

    if ($verbose && ($lcnt%100 == 0)) {
        $|=1;
        print "progress: $lcnt / $total  (post-filter: $ecnt - New: $ncnt Upd: $ucnt up-to-date: $icnt alt-name: $acnt)      \r";
        $|=0;
    }
## REGEXP FILTER geonames entries - cites and countries;
#   next unless ($fclass == 'p' || $fclass =='P' || $fclass == 'a' || $fclass =='A');
#   next unless ($fcode =~/PPL[^WQ]?$/ || $fcode =~/PCL[ID]?$/);


# REGEXP FILTER geonames entries - only cities...
    next unless ($fclass == 'p' || $fclass =='P');
    next unless ($fcode =~/PPL[^WQR]?$/);

## REGEXP FILTER geonames entries - only countries...
#   next unless ($fclass == 'a' || $fclass =='A');
#   next unless ($fcode =~/PCL[ID]?$/);

    #last if ($lcnt++ > 10000);
    #$lcnt++;
    $ecnt++;

    my $rv = add_geodate (\@data);

## extended info
    if ($verbose) {
        if ($rv==1) { $ucnt++; }
        elsif ($rv==2) { $ncnt++; }
        elsif ($rv==0) { $icnt++; }
    }

    $acnt += add_geodate_alternatives(\@data);
}

if ($verbose) { print "\n"; }

if ($cleanup) {
    if ($verbose) { print "cleaning up.\n"; }
    unlink $filename,$zipfile,$regionfile,$tmppath.'readme.txt';
    rmdir $tmppath;
}

if ($verbose) { print "\n"; }
exit;
#
# PRIVATE FUNCTIONS
#

sub add_geodate_alternatives() {
    (my $id,
     my $name,
     my $asciiname,
     my $alternatenames,
     my $latitude,
     my $longitude,
     my $fclass,
     my $fcode,
     my $country,
     my $cc2,
     my $admin1,
     my $population,
     my $elevation,
     my $gtopo30,
     my $timezone,
     my $modification_date) = @{$_[0]};

    my @data = @{$_[0]};
    my $counter=0;
    # FIXME : remove old all alt. names on update when there are none.
    # in case that *all* alt names are removed from entry.
    foreach( split(/,/ ,$alternatenames)) {
        @data[0]= sprintf("-1%02i".$id , $counter++);
        @data[1]= $_;
        add_geodate(\@data);
    }
    return ($counter);
}

## insert or update data
## arg: one geodate entry - array-ref
##
## returns -1: on error  (never!)
## returns 0: if entry was up-to-date
## returns 1: if entry was updated
## returns 2: if entry was newly created

sub add_geodate() {
    (my $id,
     my $name,
     my $asciiname,
     my $alternatenames,
     my $latitude,
     my $longitude,
     my $fclass,
     my $fcode,
     my $country,
     my $cc2,
     my $admin1,
     my $population,
     my $elevation,
     my $gtopo30,
     my $timezone,
     my $modification_date) = @{$_[0]};

#   print "+".$fcode." $name\n";
#   print "+".$country.$admin1."-  $name\n";
    chop($modification_date);

    my $rv=-1;
    my $query="";
    if (get_existing_id($id) == FALSE) {
        $rv=2;
        $query = "insert into ".$cfg{tablename}." "
            ."(name,lat,lon, country, country_id, region, modification_date,class,id)"
            ."values (?,?,?,?,?,?,?,?,?)";
    } else {
        if (!$forceupdate && is_entry_up_to_date($id,$modification_date)) { return (0); }
        if ($id < 0 && (substr $id,0,4) == "-100") {
            # print "debug $id :: -1XX".(substr $id,4)."!\n";

            # remove all entries "-1".<01-99>".substr($id,4)
            # for the case the number of _alt names has decreased:
            # this code can surely be impoved - eg. always delete next entry only.
            for (my $did=1;$did<100;$did++) {
                my $delid= sprintf("-1%02d%s",$did,(substr $id,4));
                exec_sql_query("delete from ".$cfg{tablename}." where id=".$delid);
                #print "debug $id :: $delid!\n";
            }
        }
        $rv=1;
        $query = "update ".$cfg{tablename}." set name=(?),lat=(?),lon=(?),country=(?),country_id=(?),region=(?),modification_date=(?),class=(?) where id=(?)";
    }
    my $cid=$country; # $cc2
    my $cnm=get_country_name($cid);

    my $sth=$dbh->prepare($query);
    $sth->execute($name, $latitude, $longitude, $cnm, $cid,
        get_region_name($country,$admin1),
        $modification_date, $fcode, $id);
    return ($rv);
}

## re-create geo-class sort function
## see http://www.geonames.org/export/codes.html
##  PPLC: capital
##  PPLA: seat of a first-order administrative
##  PPLG: seat of government of a political entity
##  PPL : populated place
##  PPLS : populated places
##  PPLL : populated locality
##  PPLX : section of populated place
# prereq: `createlang plpgsql zkm2` # as user postgres
# usage:
#    SELECT *,geoclass_sort(class) as sortcol FROM $tablename ORDER BY sortcol;
#
# TODO change order of ELSIF clauses to match database statistics.
sub create_function{
    exec_sql_query("drop FUNCTION geoclass_sort(VARCHAR);");
    my $query = "CREATE FUNCTION geoclass_sort (VARCHAR) RETURNS integer AS '\n"            ." DECLARE\n"
            ."  geoclass ALIAS for \$1;\n"
            ." BEGIN\n"
            ."   IF geoclass = ''PPLC'' THEN RETURN 1;\n"
            ."   ELSIF geoclass = ''PPLA'' THEN RETURN 2;\n"
            ."   ELSIF geoclass = ''PPLG'' THEN RETURN 3;\n"
            ."   ELSIF geoclass = ''PPLS'' THEN RETURN 4;\n"
            ."   ELSIF geoclass = ''PPL'' THEN RETURN 5; \n"
            ."   ELSIF geoclass = ''PPLX'' THEN RETURN 10; \n"
            ."   ELSIF geoclass = ''PPLL'' THEN RETURN 6; END IF;\n"
            ."   RETURN 10;\n"
            ." END;'\n"
            ."LANGUAGE 'plpgsql';\n";

#   print $query."\n";
    exec_sql_query($query);
}


sub create_table{
    exec_sql_query("drop table ".$cfg{tablename}.";");
    my $query = "create table ".$cfg{tablename}." ("
        ."id integer, primary key(id),"
        ."name character varying(400)," #, key(name),"
        ."lat numeric(5,2),"
        ."lon numeric(5,2),"
        ."country character varying(255),"
        ."country_id character varying(2),"
        ."class character varying(8),"
        ."modification_date character varying(32),"
        ."region character varying(400))";
    exec_sql_query($query);
}

## returns the region name for a given admin code
## args: county-code, admin-code;
sub get_region_name($,$) {
    my $reqcode=(shift).".".(shift);
    return $regions{$reqcode}
}

## read admin1codes to hash and return the hash
## arg: filename
sub read_regions($) {
    my $filename = shift;
    open(FH, "<:utf8", $filename)
        or die("Could not open geomaps Codes file.");

    my %regions;
    while () {
         chop();
         my @data = split('\t',$_);
         $regions{@data[0]} = @data[1];
    }
    return (%regions);
}

## returns the country name for a given ISO-3166 code
## args: county-code
sub get_country_name($) {
    my $code=(shift);
    return $countries{$code}
}


## read admin1codes to hash and return the hash
## read admin1codes to hash and return the hash
## arg: filename
sub read_countries($) {
    my $filename = shift;
    open(FH, "<:utf8", $filename)
        or die("Could not open geomaps Codes file.");

    my %countries;
    while () {
         chop();
         my @data = split('\t',$_);
         $countries{@data[0]} = @data[4];
    }
    return (%countries);
}

## execute sql query
## arg: sql query
sub exec_sql_query ($) {
    my $query = shift;
        my $sth=$dbh->prepare($query);
    $sth->execute();
    return (0);
}

## check if geo-ID already exists in city-table
## arg: geo-id
## returns integer id or -1 if not found.
sub get_existing_id ($){
    my $id=shift;
    my (%row,$queryrow,$sth);
    my $query="select * from ".$cfg{tablename}." where id=$id;";
        $sth=$dbh->prepare($query);
    $sth->execute();
    if (%row=%{$sth->fetchrow_hashref()}) {
        return($row{id});
    }
    return (FALSE);
}

sub is_entry_up_to_date ($;$) {
    my $id=shift;
    my $date=shift;
    my (%row,$sth);
    # TODO : check if '!=' is better  than '>=' (allow downgrades - but sync exactly)
    my $query="select * from ".$cfg{tablename}." where id=$id and modification_date >= '$date';";
    $sth=$dbh->prepare($query);
    $sth->execute();
    if (%row=%{$sth->fetchrow_hashref()}) {
        if($row{id} != 0 ) { return(1); }
    }
    return (0);
}

#// vim: sw=4 ts=4 sts=4: