#!/usr/bin/php
<?php
	// Release identifier; it is included in the startup syslog message.
	define( 'VERSION', '2.09' );

	// Access log that is tailed for TCP_MISS/206 GET requests. The line
	// parser in the main loop expects the layout produced by a squid.conf
	// setup such as:
	//
	// logformat osfetcher %{%Y-%m-%dT%H:%M:%S}tl %Ss/%03Hs %rm %ru %mt
	// access_log /var/log/squid/opensuse.log osfetcher
	//
	// NOTE(review): the example above logs to opensuse.log while this script
	// reads fetch206.log -- confirm the access_log directive matches $accesslog.
	$accesslog = '/var/log/squid/fetch206.log';

	// File with one mirror base URL per line; it is re-read whenever its
	// modification time changes.
	$mirrorlist = '/tmp/opensuse.mirrors';

	// Upper bound on the number of simultaneously running fetch children.
	$maxjobs = 50;

	// Change-tracking state: last seen modification times of both files and
	// the byte offset up to which the access log has been consumed.
	$mirrorlist_mtime = 0;
	$accesslog_mtime = 0;
	$accesslog_lastpos = 0;

	/**
	 * Read the mirror list and index it by host name.
	 *
	 * Every line is searched for a URL of the form scheme://host[/path]; the
	 * host becomes the array key and the remainder of the line (the path on
	 * that mirror) becomes the value. Lines that contain no such URL (blank
	 * lines, junk) are skipped. When a host occurs more than once the last
	 * occurrence wins.
	 *
	 * @param string $file path of the mirror list file
	 * @return array map of host => path; empty when the file is missing,
	 *               unreadable or contains no usable entry
	 */
	function load_mirrors( $file )
	{
		// always hand back an array so callers can count()/look up safely
		$mirrors=array();

		if ( !is_file($file) || !is_readable($file) )
		{
			return $mirrors;
		}

		$f=fopen($file,'r');
		if ( FALSE===$f )
		{
			return $mirrors;
		}

		// compare against FALSE explicitly: a line such as "0" is falsy but
		// must not terminate the loop
		while( FALSE!==($line=fgets($f)) )
		{
			// drop the line terminator (and trailing blanks / CR) so it
			// cannot end up inside the stored path
			$line=rtrim($line);

			if ( !preg_match('@[a-z]+://([^/\n]+)(.*)$@', $line, $match) )
			{
				continue;
			}

			$mirrors[$match[1]]=$match[2];
		}
		fclose($f);

		return $mirrors;
	}


	// Exported into the process environment before any child is started, so
	// the wget processes exec'ed later see it (presumably the local squid
	// listens on port 3128 -- confirm).
	putenv('http_proxy=http://localhost:3128');

	// All diagnostics go to syslog, tagged with the process id.
	openlog('fetcher206', LOG_PID, LOG_DAEMON);
	syslog( LOG_INFO, sprintf('fetcher206 v%s started', VERSION) );

	// Job table keyed by package path. Each entry carries 'state'
	// (1 = waiting for a free slot, 2 = fetch child running, 3 = finished
	// and kept until 'expire'), 'url', 'pid' and 'expire'.
	$jobs = array();

	// Counters maintained by the main loop, plus the last "active/queued"
	// summary that was written to the log.
	$queuedjobs = 0;
	$activejobs = 0;
	$jobstat = 0;
	// Main service loop. Each pass
	//   1. logs the job counters when they changed,
	//   2. reloads the mirror list when its mtime changed,
	//   3. advances every job through its state machine
	//      (1 = queued -> 2 = wget child running -> 3 = done, kept until expiry),
	//   4. reads the lines appended to the access log since the previous pass
	//      and queues a fetch for every "TCP_MISS/206 GET" on a known mirror.
	// The loop never terminates; it sleeps 1s while jobs are outstanding and
	// 5s when idle, as long as the access log is unchanged.
	while( TRUE )
	{
		$now=time();

		// The mirror table is only assigned once the mirror list could be
		// stat'ed; make sure it exists so the host lookup below never
		// operates on an undefined variable.
		if ( !isset($mirrors) )
		{
			$mirrors=array();
		}

		// report the counters only when they differ from the last report
		$s=$activejobs.'/'.$queuedjobs;
		if ( $jobstat!==$s )
		{
			syslog(LOG_DEBUG, sprintf('jobs: %d queued, %d active', $queuedjobs, $activejobs) );
			$jobstat=$s;
		}

		// stat results are cached by PHP; drop them so changes are seen
		clearstatcache();

		// check to see if the mirrors list was updated and reload
		$m=is_file($mirrorlist) ? filemtime($mirrorlist) : FALSE;
		if ( $m!=$mirrorlist_mtime )
		{
			$mirrors=load_mirrors($mirrorlist);
			$mirrorlist_mtime=$m;
			// date() replaces strftime(), which is deprecated as of PHP 8.1
			syslog(LOG_NOTICE,sprintf('mirror list re/loaded, %u entries, last changed %s', count($mirrors), date('Y/m/d H:i:s', (int)$m) ));
		}

		// check for children having finished and for ones to be started
		foreach( $jobs as $pkg => $j )
		{
			switch( $j['state'] )
			{
				case 1:		// package pending retrieval.
					if ( $activejobs>=$maxjobs )
					{
						break;
					}

					$pid=pcntl_fork();
					if ( $pid<0 )
					{
						// fork failed: keep the job queued and retry on a
						// later pass. Recording -1 as the pid would make
						// pcntl_waitpid() reap an arbitrary child.
						syslog(LOG_WARNING,sprintf('unable to fork for %s, will retry', $j['url'] ) );
						break;
					}
					if ( 0==$pid )
					{
						// child: replace ourselves with wget, discarding the
						// download (the point is to warm the proxy cache)
						syslog(LOG_DEBUG,sprintf("fetching %s\n", $j['url'] ) );
						$a=array('-q','-nd','-O','/dev/null',$j['url']);
						pcntl_exec('/usr/bin/wget',$a);
						// only reached when the exec itself failed
						exit(1);
					}

					$activejobs++;
					$jobs[$pkg]['pid']=$pid;
					$jobs[$pkg]['start']=$now;
					$jobs[$pkg]['state']=2;
					break;

				case 2:		// fetch child running; poll without blocking.
					if ( pcntl_waitpid( $j['pid'],$wstatus,WNOHANG )>0 )
					{
						syslog(LOG_DEBUG,sprintf("job complete: %s, elapsetime %us", $pkg, $now-$j['start']) );

						$activejobs--;
						$queuedjobs--;
						$jobs[$pkg]['pid']=0;
						// remember the package for 4 hours so repeated
						// requests do not immediately trigger a new fetch
						$jobs[$pkg]['expire']=$now+4*3600;
						$jobs[$pkg]['state']=3;
					}
					break;

				case 3:		// finished; forget the entry once it expired.
					if ( $now>$j['expire'] )
					{
						syslog(LOG_DEBUG,sprintf("job expired: %s", $pkg ));
						unset( $jobs[$pkg] );
					}
					break;
			}
		}

		// nothing new in the access log: nap (briefly while jobs are pending)
		$m=is_file($accesslog) ? filemtime($accesslog) : FALSE;
		if ( $m==$accesslog_mtime )
		{
			sleep( ( $activejobs>0 || $queuedjobs>0 ) ? 1 : 5 );
			continue;
		}
		$accesslog_mtime=$m;

		// open log, seek to last place we read, try to read from there
		$f=fopen($accesslog, 'r');
		if ( FALSE===$f )
		{
			// LOG_ERR is the syslog priority constant; LOG_ERROR does not exist
			syslog(LOG_ERR, sprintf("Unable to open %s for reading\n", $accesslog));
			continue;
		}

		// if we would attempt to seek past end-of-file, the logfile has probably
		// been rotated, so just start from 0. The size is taken from the open
		// handle so it describes the very file we are about to read.
		$st=fstat($f);
		if ( $st['size']<$accesslog_lastpos )
		{
			$accesslog_lastpos=0;
		}
		fseek($f,$accesslog_lastpos,SEEK_SET);

		$added=0;
		$restarted=0;
		$pos=$accesslog_lastpos;
		while( FALSE!==($line=fgets($f)) )
		{
			// a line without terminator is still being written; leave it
			// for the next pass instead of parsing a truncated URL
			if ( substr($line,-1)!=="\n" )
			{
				break;
			}
			$pos=ftell($f);

			// <timestamp> TCP_MISS/206 GET http://host/path/pkg <mime>
			if ( !preg_match('/^[^\s]+\s([^\s\/]+)\/([^\s]+)\s([^\s]+)\s([^\s]+)\s(.*)$/',$line,$match) )
			{
				continue;
			}
			if ( $match[3]!=='GET' || $match[1]!=='TCP_MISS' || $match[2]!=='206' )
			{
				continue;
			}
			$url=$match[4];

			// split into host, directory and file name
			if ( !preg_match('#^[^:]+://([^/]+)(/.*)/([^/]+)$#',$url,$match) )
			{
				continue;
			}
			$host=$match[1];
			$path=$match[2];
			$pkg=$match[3];

			// then check if hostname is one of the opensuse mirrors
			if ( !array_key_exists($host,$mirrors) )
			{
				continue;
			}

			// strip the mirror specific prefix so the same package is
			// recognised regardless of the mirror it was requested from
			$localpath=$mirrors[$host];
			$key=substr($path,strlen($localpath)).'/'.$pkg;

			if ( !array_key_exists($key,$jobs) )
			{
				$jobs[$key]=array('state'=>1,'url'=>$url,'pid'=>0,'expire'=>0);
				$queuedjobs++;
				$added++;
			}
			else if ( $jobs[$key]['state']==3 )
			{
				// we got a MISS/206 on something already in queue? if state is 3, let's retry it.
				$jobs[$key]['state']=1;
				$queuedjobs++;
				$restarted++;
			}
		}
		$accesslog_lastpos=$pos;
		fclose($f);

		if ( $added>0 || $restarted>0 )
		{
			syslog(LOG_DEBUG, sprintf('jobs: %u added, %u restarted', $added, $restarted));
		}
	}
?>
