Upload
others
View
6
Download
0
Embed Size (px)
Citation preview
URL Retrieval Procedure – Bulgarian National Statistical Institute
Initial data from BNSI Business Register
Businesses with 10 or more employees, together with their contact e-mail addresses and website URLs, were uploaded to a
database.
26836 businesses, 20649 e-mails, 2006 urls.
Step 1
Check whether the initial URLs are real websites with the help of a web script and save the results in the
database.
Step 2
Construct domain names from the initial e-mail addresses with a script, excluding popular e-mail services
(like gmail, yahoo, etc.). Check whether the constructed domains are real websites with the help of a web
script and save the results in the database.
The combined results from step 1 and step 2 suggest 7038 possible business URLs.
Step 3
A search script that uses the automated search interface of http://www.jabse.com (Just Another Bulgarian
Search Engine).
Get up to 10 search results for each business from its name in Bulgarian.
Get up to 10 search results for each business from its name transliterated into Latin.
Exclude the complex URLs from the search results, keep just those that go no further than the domain name, and suggest
them as most probable.
Save the results in the database in text and html format.
15638 sets of up to 10 most probable search results in Bulgarian, 16201 sets of up to 10 most
probable search results in Latin.
Step 4
A search script that uses the Google search interface.
Get up to 10 search results for each business from its name in Bulgarian.
Save the results in the database in json format.
26829 sets of up to 10 search results.
Step 5
Manual crawling of the businesses by the project participants, who choose the real URLs from
the URLs suggested in the previous steps with the help of the database crawling interface.
9809 real urls of businesses found.
ANNEX: URL retrieval BNSI scripts
<meta http-equiv="Refresh" content="30">
<?php
// Open the MySQL connection used by this script; abort with the Bulgarian
// "no connection to the database" message when it cannot be established.
$link = mysql_connect('localhost', '', '');
if (!$link) {
    die('Няма връзка с базата данни ');
}

// Select the working schema; abort with the Bulgarian "requested database
// does not exist" message otherwise.
if (!mysql_select_db('essbigdata')) {
    die('Не съществува база данни, поискана от заявката ');
}

// Process one batch of register rows. The function is declared further down;
// PHP hoists top-level function declarations, so the early call is valid.
get_from_url();
/**
 * Escapes a value for use inside a single-quoted MySQL string literal.
 *
 * Uses mysql_real_escape_string() (the script already depends on the mysql
 * extension and an open connection); falls back to addslashes() only when
 * that extension is not loaded.
 *
 * @param mixed $value Value to embed in SQL.
 * @return string Escaped text, without surrounding quotes.
 */
function ikt_sql_escape($value)
{
    $value = (string) $value;
    return function_exists('mysql_real_escape_string')
        ? mysql_real_escape_string($value)
        : addslashes($value);
}

/**
 * Builds the UPDATE statement that records the outcome of a check for one business.
 *
 * @param string      $eik Business identifier (value of the EIK column).
 * @param string|null $url Confirmed site to store in `url`, or null to only
 *                         refresh `datechecked`.
 * @return string SQL statement.
 */
function ikt_update_sql($eik, $url = null)
{
    $set = '';
    if ($url !== null) {
        $set = "url='" . ikt_sql_escape($url) . "', ";
    }
    return "UPDATE ikturl SET " . $set . "datechecked=" . time()
        . " WHERE EIK='" . ikt_sql_escape($eik) . "';";
}

/**
 * Fetches http://$host and, if that yields nothing, http://www.$host through the proxy.
 *
 * @param string $host Host name (optionally followed by a path), without scheme.
 * @return string 'plain'   - http://$host returned a page;
 *                'www'     - only http://www.$host returned a page;
 *                'blocked' - the first non-empty response contained the
 *                            "could not be retrieved" error text;
 *                'none'    - both requests returned nothing.
 */
function ikt_probe_host($host)
{
    // NOTE(review): presumably the text of the proxy's own error page - confirm.
    $errorMarker = 'The requested URL could not be retrieved';
    $context = stream_context_create(array('http' => array(
        'proxy' => 'tcp://172.16.134.80:3128',
        'request_fulluri' => true,
    )));

    $variant = 'plain';
    $page = file_get_contents("http://" . $host, false, $context);
    if (empty($page)) {
        $variant = 'www';
        $page = file_get_contents("http://www." . $host, false, $context);
        if (empty($page)) {
            return 'none';
        }
    }
    return strpos($page, $errorMarker) !== false ? 'blocked' : $variant;
}

/**
 * Checks the 100 least recently checked rows of `ikturl`.
 *
 * For every row the registered `Web` value is probed; when it is empty or
 * nothing answers, the site is derived from the contact e-mail instead
 * (see get_from_email()). The resulting UPDATE is echoed and executed.
 *
 * @return void
 */
function get_from_url()
{
    $result = mysql_query("SELECT * FROM ikturl ORDER BY datechecked LIMIT 100");
    if (!$result) {
        return;
    }

    while ($row = mysql_fetch_assoc($result)) {
        echo "<br/>" . $row["EIK"] . " --- ";

        // Stamp the row before probing, so it moves to the end of the queue
        // even if the probe below is slow or aborts the request.
        mysql_query(ikt_update_sql($row["EIK"]));

        if (empty($row["Web"])) {
            $sql = get_from_email($row);
        } else {
            $state = ikt_probe_host($row["Web"]);
            if ($state === 'none') {
                $sql = get_from_email($row);
            } else {
                // The registered value is stored as-is, whichever variant answered.
                $sql = ($state === 'blocked')
                    ? ikt_update_sql($row["EIK"])
                    : ikt_update_sql($row["EIK"], $row["Web"]);
                echo $sql;
            }
        }

        mysql_query($sql);
    }
}

/**
 * Derives a candidate website from the domain of the business's contact e-mail.
 *
 * Domains of public mail providers are skipped. The returned statement is
 * also echoed.
 *
 * @param array $row Row of `ikturl`; reads the "EIK" and "e_mail" keys.
 * @return string UPDATE statement: sets `url` when a site answered, otherwise
 *                only refreshes `datechecked`. Always a valid statement.
 */
function get_from_email($row)
{
    $notgood = array(
        "abv.bg", "mbox.contact.bg", "b-trust.org", "dir.bg", "mail.bg", "mbox.is.bg",
        "parvomai.escom.bg", "ogosta.com", "gbg.bg", "gmail.com", "yahoo.com", "mbox.is-bg.net",
    );

    $email = isset($row["e_mail"]) ? (string) $row["e_mail"] : '';
    $parts = explode("@", $email);
    $domain = isset($parts[1]) ? strtolower(trim($parts[1])) : '';

    // Only well-formed host names are probed or written back; anything else
    // (missing "@", several addresses in one field, stray characters) is skipped.
    $isHost = preg_match(
        '/^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)+$/',
        $domain
    ) === 1;

    $url = null;
    if ($isHost && !in_array($domain, $notgood, true)) {
        $state = ikt_probe_host($domain);
        if ($state === 'plain') {
            $url = $domain;
        } elseif ($state === 'www') {
            $url = "www." . $domain;
        }
        // 'blocked' and 'none' leave $url null: only datechecked is refreshed.
    }

    $sql = ikt_update_sql($row["EIK"], $url);
    echo $sql;
    return $sql;
}
// Close the MySQL connection opened at the start of this script.
mysql_close($link);
?>
<meta http-equiv="Refresh" content="40">
Моля, поставете линк към <a href="http://www.jabse.com" class="tip">http://www.jabse.com</a>
на вашия сайт или в програмата, където използвате интерфейса.<br>
<?php
/*
 * Jabse search step: takes the least recently checked business that still has
 * an empty Jabse result column, searches for its name as stored and again
 * transliterated to Latin, and stores the domain-level hits.
 */

// Lower-case Cyrillic letters and their Latin replacements, index-aligned
// (both arrays must have the same length, otherwise str_replace() substitutes
// an empty string for the unmatched letters).
// NOTE(review): "ъ" is mapped to "a" as in the official Bulgarian
// transliteration system - confirm this is the intended convention.
$cyr = array("а","б","в","г","д","е","ж","з","и","й","к","л","м","н","о","п","р","с","т","у","ф","х","ц",
    "ч","ш","щ","ъ","ь","ю","я");
$lat = array("a","b","v","g","d","e","zh","z","i","j","k","l","m","n","o","p","r","s","t","u","f","h","c",
    "ch","sh","sht","a","j","ju","ja");

/**
 * Filters one set of search hits and prints them for the operator.
 *
 * A hit is kept as a "maybe" when its URL ends right after the host name (or
 * matches the second, "...bg/" alternative of the pattern). It is additionally
 * kept as a match when the query occurs in its title or text.
 *
 * @param array  $hits  Hits as returned by jabse_interface::jabse_search().
 * @param string $query The query the hits were produced for.
 * @return array Keys: 'url' (comma-separated matches), 'maybe' (comma-separated
 *               candidates), 'markup' (<maybe> elements for the candidates).
 */
function jabse_collect_candidates($hits, $query)
{
    $needle = mb_strtolower(utf8($query), 'UTF-8');
    $matched = array();
    $maybe = array();
    $markup = "";

    foreach ($hits as $a) {
        if (!preg_match("/(\/\/.+\.\w+\/$|\/\/.+\.\w+\/.+bg\/$)/i", $a["url"])) {
            continue;
        }

        echo "<li>";
        echo "A match was found.<br/>";
        echo "url: " . $a["url"] . "<br/>";
        echo "title: " . $a["title"] . "<br/>";
        echo "text: " . $a["text"] . "<br/>";
        echo "</li>";

        $maybe[] = $a["url"];
        $markup .= "<maybe><url>" . $a["url"] . "</url><title>" . $a["title"]
            . "</title><text>" . $a["text"] . "</text></maybe>";

        // Both sides are already lower-cased, so a plain substring test is enough.
        $inTitle = $needle !== ''
            && strpos(mb_strtolower(utf8($a["title"]), 'UTF-8'), $needle) !== false;
        $inText = $needle !== ''
            && strpos(mb_strtolower(utf8($a["text"]), 'UTF-8'), $needle) !== false;

        if ($inTitle || $inText) {
            echo "<li>";
            echo "url: " . $a["url"] . "<br/>";
            echo "title: " . $a["title"] . "<br/>";
            echo "text: " . $a["text"] . "<br/>";
            echo "docsizekb: " . $a["docsizekb"] . "<br/>";
            echo "datecreated: " . $a["datecreated"] . "<br/>";
            echo "datemodified: " . $a["datemodified"] . "<br/>";
            echo "pagerating: " . $a["pagerating"] . "<br/>";
            echo "</li><br/>";
            $matched[] = $a["url"];
        }
    }

    return array(
        'url' => implode(",", $matched),
        'maybe' => implode(",", $maybe),
        'markup' => $markup,
    );
}

/**
 * Writes one set of candidates to the three columns sharing the prefix $column
 * ($column, {$column}_maybe, {$column}_maybe_json) and echoes the statement.
 *
 * @param string $eik    Business identifier.
 * @param string $column 'url_jabse' or 'url_jabse_lat'.
 * @param array  $found  Result of jabse_collect_candidates().
 * @return mixed Result of mysql_query().
 */
function jabse_store_candidates($eik, $column, $found)
{
    $sql = "UPDATE ikturl SET "
        . $column . "='" . mysql_real_escape_string($found['url']) . "', "
        . $column . "_maybe='" . mysql_real_escape_string($found['maybe']) . "', "
        . $column . "_maybe_json='" . mysql_real_escape_string($found['markup']) . "', "
        . "datechecked=" . time()
        . " WHERE EIK='" . mysql_real_escape_string($eik) . "';";
    echo "<br/>" . $sql;
    return mysql_query($sql);
}

$link = mysql_connect('localhost', '', '')
    or die('Няма връзка с базата данни ');
mysql_select_db('essbigdata') or die('Не съществува база данни, поискана от заявката ');

$sql = "SELECT * FROM ikturl WHERE url_jabse_maybe_json='' OR url_jabse_lat_maybe_json=''"
    . " ORDER BY datechecked LIMIT 1";
$result = mysql_query($sql);
if ($result) {
    include_once "jabse_interface.php";

    while ($row = mysql_fetch_assoc($result)) {
        // Stamp the row first so it moves to the end of the queue.
        mysql_query("UPDATE ikturl SET datechecked=" . time()
            . " WHERE EIK='" . mysql_real_escape_string($row["EIK"]) . "';");

        $start = 0;

        // Pass 1: the name as stored in the register.
        $jquery = trim($row["NAME"]);
        echo "<br/>" . $jquery;
        // A fresh interface object per search: the object accumulates results,
        // so reusing it could leak hits from one search into the next.
        $js = new jabse_interface;
        $found = jabse_collect_candidates($js->jabse_search($jquery, $start), $jquery);
        jabse_store_candidates($row["EIK"], 'url_jabse', $found);

        // Pass 2: the name transliterated to Latin.
        $jquery = str_replace($cyr, $lat, mb_strtolower(utf8($jquery), 'UTF-8'));
        echo "<br/><br/><br/>" . $jquery;
        $js = new jabse_interface;
        $found = jabse_collect_candidates($js->jabse_search($jquery, $start), $jquery);
        jabse_store_candidates($row["EIK"], 'url_jabse_lat', $found);
    }
}
mysql_close($link);
/**
 * Returns the given text as UTF-8.
 *
 * Text that is already valid UTF-8 is returned unchanged; anything else is
 * assumed to be Windows-1251 (the Cyrillic code page, and the only other
 * charset the Jabse interface is documented to return) and converted.
 *
 * @param string $utf8 Text in UTF-8 or Windows-1251.
 * @return string UTF-8 text; the input unchanged if the conversion fails.
 */
function utf8($utf8)
{
    if (mb_detect_encoding($utf8, 'UTF-8', true) === 'UTF-8') {
        return $utf8;
    }
    $converted = iconv("windows-1251", "utf-8", $utf8);
    return $converted === false ? $utf8 : $converted;
}
?>
<?php
/**
 * Client for the text interface of the Jabse search engine.
 *
 * Usage: create an instance, optionally adjust the settings below, then call
 * jabse_search(). No constructor is used on purpose, so that values may be
 * changed before searching.
 */
class jabse_interface
{
    // Public
    public $jquery;               // search query
    public $start = 0;            // start from result line No
    public $start_numeric;        // actual start line
    public $perpage = 50;         // lines per page
    public $rows;                 // total number of rows in result
    public $cols;                 // number of columns in result
    public $previous_page;        // key for request for previous page
    public $next_page;            // key for request for next page
    public $server_timestamp;
    public $result = array();
    public $error_code = 0;       // Error codes: 1: $_GET[query] not set or too short
                                  //              2: User key not found in database
                                  //              3: Per hour limit exceeded
                                  //              4: Charset not supported
                                  //              5: Perpage value too high
                                  //              6: Can not open url

    // Private by convention (left accessible so they can be changed before searching)
    public $url = "http://www.jabse.com/interface.php"; // url of the jabse interface
    public $charset = 'utf-8';    // charset of the returned result. Supported values are windows-1251 and utf-8
    public $siteid = '0';         // the id of your site, if you want to search only 1 site
    public $serverkey = '';       // the key you received at registration - should be sent with every query
    public $retry_limit = 1;      // retries for opening the search url
    public $handle;               // file handle for the search url
    public $EndOfControlMarker = '#';
    public $delimiter = ',';
    public $skip_bolding = 'false'; // true to disable adding of <b></b> around found search words, false to enable

    /**
     * Runs a search and returns the parsed hits.
     *
     * @param string $jquery Search query.
     * @param int    $start  Result line to start from.
     * @return array Hits keyed by their line number in the response (starting
     *               at 1); each hit has the keys url, title, text, docsizekb,
     *               datecreated, datemodified, pagerating. Empty when the
     *               request failed (see $error_code).
     */
    function jabse_search($jquery, $start)
    {
        $this->jquery = $jquery;
        $this->start = $start;
        // Start from a clean state so a reused object never returns hits or
        // an error code left over from a previous search.
        $this->result = array();
        $this->error_code = 0;

        $this->send_search();
        if ($this->handle) {
            $this->process_search();
        } else {
            $this->error_code = 6;
        }
        return $this->result;
    }

    /**
     * Reads the response from $this->handle, closes it and fills the status
     * properties and $this->result.
     *
     * Response layout, as parsed here: the first line is a comma-separated
     * status record; every following line (the last one is ignored) is a list
     * of field lengths, the end-of-control marker, and then the field values
     * concatenated without separators.
     *
     * @return void
     */
    function process_search()
    {
        $contents = (string) stream_get_contents($this->handle);
        fclose($this->handle);

        $lines = explode("\n", $contents);
        // Pad so a short or empty status line yields nulls instead of notices.
        $status = array_pad(explode(',', $lines[0]), 10, null);

        if ($status[0] == '0') { // error
            $this->error_code = $status[1];
            return;
        }

        $this->rows = $status[2];
        $this->cols = $status[3];
        $this->previous_page = $status[6];
        $this->next_page = $status[7];
        $this->server_timestamp = $status[8];
        $this->start_numeric = $status[9];

        $fields = array('url', 'title', 'text', 'docsizekb', 'datecreated', 'datemodified', 'pagerating');
        $last = sizeof($lines) - 1;
        for ($cc = 1; $cc < $last; $cc++) {
            $line = $lines[$cc];
            $markerpos = strpos($line, $this->EndOfControlMarker);
            if ($markerpos === false) {
                continue; // not a result record
            }

            // Control part: the length of each field, in field order.
            $lengths = explode($this->delimiter, substr($line, 0, $markerpos));

            // Each field starts where the previous one ended.
            $offset = $markerpos + 1;
            foreach ($fields as $index => $field) {
                $length = isset($lengths[$index]) ? (int) $lengths[$index] : 0;
                $this->result[$cc][$field] = (string) substr($line, $offset, $length);
                $offset += $length;
            }
        }
    }

    /**
     * Opens the search URL through the HTTP proxy and stores the stream in
     * $this->handle (false when every attempt failed). At least one attempt
     * is made; further attempts are limited by $retry_limit.
     *
     * @return void
     */
    function send_search()
    {
        $url = $this->url . "?jquery=" . urlencode($this->jquery)
            . "&start=" . $this->start
            . "&perpage=" . $this->perpage
            . "&charset=" . $this->charset
            . "&serverkey=" . $this->serverkey
            . "&siteid=" . $this->siteid
            . "&skip_bolding=" . $this->skip_bolding;
        // NOTE(review): this prints the request URL, including the server key,
        // into the page - confirm that is acceptable where the script runs.
        echo "<br/>" . $url;

        $context = stream_context_create(array('http' => array(
            'proxy' => 'tcp://172.16.134.80:3128',
            'request_fulluri' => true,
        )));

        $retries = 0;
        do {
            $this->handle = @fopen($url, "r", false, $context);
            $retries++;
        } while ((!$this->handle) && ($retries < $this->retry_limit));
    }
}//jabse_interface class ends
?>
<meta http-equiv="Refresh" content="9000">
<?php
/*
 * Google search step: takes the least recently checked business that has no
 * stored Google response yet, queries the search API for its name and stores
 * the raw response in `url_google_meybe`.
 */
$link = mysql_connect('localhost', '', '')
    or die('Няма връзка с базата данни ');
mysql_select_db('essbigdata') or die('Не съществува база данни, поискана от заявката ');

$sql = "SELECT * FROM ikturl WHERE url_google_meybe='' ORDER BY datechecked_google LIMIT 1";
$result = mysql_query($sql);
if ($result) {
    while ($row = mysql_fetch_assoc($result)) {
        $eik = mysql_real_escape_string($row["EIK"], $link);

        // Stamp the row first so it moves to the end of the queue.
        $sql = "UPDATE ikturl SET datechecked_google=" . time() . " WHERE EIK='" . $eik . "';";
        $result1 = mysql_query($sql);

        // send_search() returns false when every attempt failed; that is
        // stored as '' so the row is picked up again later.
        $response = (string) send_search(trim($row["NAME"]), 2);
        echo mb_detect_encoding($response, mb_detect_order(), true);

        // Escape with the connection's own escaper: doubling single quotes
        // alone lets MySQL interpret the backslash escapes inside the JSON
        // (\" \n \\), which alters the stored document.
        $json = mysql_real_escape_string($response, $link);
        $sql = "UPDATE ikturl SET url_google_meybe='" . $json . "', datechecked_google=" . time()
            . " WHERE EIK='" . $eik . "';";
        $result1 = mysql_query($sql, $link);
        echo mysql_errno($link) . ": " . mysql_error($link) . "\n";
        echo $sql;
    }
}
mysql_close($link);
/**
 * Requests the Google Custom Search API for the given query through the HTTP proxy.
 *
 * @param string $q Search query (URL-encoded here).
 * @param int    $r Maximum number of attempts; one attempt is always made.
 * @return string|false Response body, or the falsy result of the last
 *                      file_get_contents() call when every attempt failed.
 */
function send_search($q, $r)
{
    $endpoint = "https://www.googleapis.com/customsearch/v1?key=???&q=" . urlencode($q) . "";
    $proxySettings = array('http' => array('proxy' => 'tcp://172.16.134.80:3128',
        'request_fulluri' => true));

    $attempt = 0;
    while (true) {
        $body = file_get_contents($endpoint, false, stream_context_create($proxySettings));
        $attempt++;
        // Stop on the first non-empty response or once the attempts are used up.
        if ($body || $attempt >= $r) {
            return $body;
        }
    }
}//send search
?>
<html>
<head>
<meta charset="UTF-8">
<style>
table{
width: 100%;
}
td div:nth-child(odd) {
background: #E5ECEC;
}
p{
word-wrap:break-word;
}
table, th, td {
border: 1px solid black;
}
table td {
vertical-align:top;
}
form{
margin-bottom:0px;
}
br{
line-height: 5px;
}
div {
margin-bottom:10px;
padding: 3px;
}
td {
margin-bottom:10px;
padding: 3px;
max-width:10px;
}
.row-1{
width: 5%;
}
.row-2{
width: 15%;
}
.row-3{
width: 15%;
min-width:210px;
}
.row-4{
width: 15%;
}
.row-5{
width: 25%;
}
.row-6{
width: 25%;
}
</style>
</head>
<body>
<?php
/*
 * Request parameters of the crawling interface.
 *   GET  n - number of records to skip (default 0)
 *   GET  l - number of records to show (default 100)
 *   POST eik, url_final - the URL chosen for one business
 */

// Both paging values end up in a LIMIT clause and in links, so only
// integers in range are accepted; anything else falls back to the default.
$n = filter_input(INPUT_GET, 'n', FILTER_VALIDATE_INT, array('options' => array('min_range' => 0)));
if ($n === null || $n === false) {
    $n = 0;
}
$l = filter_input(INPUT_GET, 'l', FILTER_VALIDATE_INT, array('options' => array('min_range' => 1)));
if ($l === null || $l === false) {
    $l = 100;
}
// Running row number shown in the first table column.
$i = $n;

$peik = "";
if (!empty($_POST["eik"])) {
    $peik = strtolower(filter_input(INPUT_POST, 'eik', FILTER_SANITIZE_STRING,
        FILTER_FLAG_STRIP_HIGH));
}

$purl_final = "";
if (!empty($_POST["url_final"])) {
    $purl_final = strtolower(filter_input(INPUT_POST, 'url_final',
        FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH));
}
?>
<?php
// Paging form and previous/next links. PHP_SELF can contain request-supplied
// path info, so it is HTML-escaped; the offsets are forced to integers and the
// "previous" offset never goes below zero.
?>
<form method="get" action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8');?>">
<input type="submit" value="Покажи">
<label> след запис </label>
<input type="text" name="n" size="6" value="<?php echo (int) $n;?>">
<label> следващите </label>
<input type="text" name="l" size="6" value="<?php echo (int) $l;?>">
<label> записа. </label>
</form>
<br/>
<a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8')."?n=".max(0, (int) $n - (int) $l)."&amp;l=".(int) $l;?>">Предишна</a>
<a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8')."?n=".((int) $n + (int) $l)."&amp;l=".(int) $l;?>">Следваща</a>
<br/>
<br/>
<?php
/**
 * Escapes a database value for output as HTML text or inside a quoted attribute.
 *
 * @param mixed $value Value to print.
 * @return string Escaped text.
 */
function crawl_html($value)
{
    // ENT_SUBSTITUTE keeps the rest of a value readable when it contains
    // bytes that are not valid UTF-8; it only exists from PHP 5.4 on.
    $flags = ENT_QUOTES | (defined('ENT_SUBSTITUTE') ? ENT_SUBSTITUTE : 0);
    return htmlspecialchars((string) $value, $flags, 'UTF-8');
}

$link = mysql_connect('localhost', '', '')
    or die('Няма връзка с базата данни ');
mysql_select_db('essbigdata') or die('Не съществува база данни, поискана от заявката ');

// Store the URL the operator picked for one business (posted by the small
// forms rendered next to every candidate).
if ($peik !== "") {
    $sql = "UPDATE ikturl SET url_final='" . mysql_real_escape_string($purl_final, $link)
        . "' WHERE EIK='" . mysql_real_escape_string($peik, $link) . "';";
    $result = mysql_query($sql);
}

// One page of businesses; the operands are forced to non-negative integers.
$sql = "SELECT * FROM ikturl LIMIT " . max(0, (int) $n) . ", " . max(0, (int) $l);
$result = mysql_query($sql);
if ($result) {
    echo "<table>";
    echo "<tr>";
    echo "<th class=\"row-1\">Номер</th>";
    echo "<th class=\"row-2\">СБР Инфо</th>";
    echo "<th class=\"row-3\">Финал</th>";
    echo "<th class=\"row-4\">Проверен</th>";
    echo "<th class=\"row-5\">jabse може би json</th>";
    echo "<th class=\"row-6\">google може би json</th>";
    echo "</tr>";

    while ($row = mysql_fetch_assoc($result)) {
        $i++;
        $web = crawl_html($row["Web"]);
        $url = crawl_html($row["url"]);

        echo "<tr>";
        echo "<td>" . crawl_html($i) . "</td>";

        // Register data of the business.
        echo "<td>" . crawl_html($row["NAME"]) . "<br/>" . crawl_html($row["EIK"]) . "<br/>"
            . crawl_html($row["e_mail"])
            . "<br/><a href=\"http://" . $web . "\" target=\"balnk\">" . $web . "</a>";
        echo "<br/>" . crawl_html($row["KID4_08"]) . "<br/>" . crawl_html($row["OBL"]) . "<br/>"
            . crawl_html($row["adres_kontakt"]) . "<br/>" . crawl_html($row["telefon"]) . "<br/>"
            . crawl_html($row["fax"]) . "<br/>" . crawl_html($row["GSM"]) . "</td>";

        // Editable final URL.
        echo "<td>" . urlpost_final($row["EIK"], $row["url_final"]) . "</td>";

        // URL confirmed by the automatic check, with a button to accept it.
        echo "<td><a href=\"http://" . $url . "\" target=\"balnk\">" . $url . "</a>"
            . urlpost($row["EIK"], $row["url"]) . "</td>";

        // Candidates from the Jabse searches (Bulgarian, then Latin) and from Google.
        // These helpers return ready-made HTML, so their output is not escaped again.
        echo "<td>" . jabse_json($row["url_jabse_maybe_json"], $row["EIK"]) . "";
        echo "<br/>" . jabse_json($row["url_jabse_lat_maybe_json"], $row["EIK"]) . "</td>";
        echo "<td>" . google_json($row["url_google_meybe"], $row["EIK"]) . "</td>";
        echo "</tr>";
    }
    echo "</table>";
}
/**
 * Renders a comma-separated list of URLs as links, each followed by the
 * "this is it" form produced by urlpost().
 *
 * @param string $str Comma-separated URLs; empty entries are ignored.
 * @param string $eik Business identifier the URLs belong to.
 * @return string HTML, one <div> per URL; empty string when there are none.
 */
function jabse_split($str, $eik)
{
    $r = "";
    // explode() instead of split(): split() was a regex function that no
    // longer exists in PHP 7, and the separator here is a literal comma.
    foreach (explode(",", (string) $str) as $v) {
        if ($v !== "") {
            $safe = htmlspecialchars($v, ENT_QUOTES, 'UTF-8');
            $r .= "<div><a href=\"" . $safe . "\" target=\"balnk\">" . $safe . "</a>";
            $r .= urlpost($eik, $v);
            $r .= "</div>";
        }
    }
    return $r;
}
/**
 * Turns the stored Jabse candidate markup into HTML for the crawling table.
 *
 * <maybe> elements become <div> blocks, title and text are put on their own
 * lines, and every <url> element becomes a link followed by the form produced
 * by urlpost().
 *
 * @param string $str Stored markup (<maybe><url/><title/><text/></maybe> ...).
 * @param string $eik Business identifier the candidates belong to.
 * @return string HTML.
 */
function jabse_json($str, $eik)
{
    // Replacements are applied in array order, one after another.
    $str = str_replace(
        array("<maybe>", "</maybe>", "<title>", "</title>", "<text>", "</text>"),
        array("<div>", "</div>", "<br/>", "", "<br/>", ""),
        (string) $str
    );

    // Lazy quantifier: each match must end at the nearest </url>, otherwise
    // candidates without any whitespace between them are swallowed as one URL.
    if (!preg_match_all("/<url>(\S+?)<\/url>/i", $str, $matches, PREG_SET_ORDER)) {
        return $str;
    }

    $done = array();
    foreach ($matches as $m) {
        $tagged = $m[0];
        $url = $m[1];
        if (isset($done[$tagged])) {
            continue; // str_replace() below already handled every occurrence
        }
        $done[$tagged] = true;

        $safe = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
        $replacement = "<a href=\"" . $safe . "\" target=\"blank\">" . $safe . "</a>" . urlpost($eik, $url);
        $str = str_replace($tagged, $replacement, $str);
    }
    return $str;
}
/**
 * Returns the text that follows $key in $chunk, up to the next double quote.
 *
 * @param string $chunk Part of the stored response.
 * @param string $key   Key marker including the opening quote of the value,
 *                      e.g. '"link": "'.
 * @return string The value, or an empty string when the key is absent.
 */
function google_json_value($chunk, $key)
{
    $after = explode($key, $chunk, 2);
    if (count($after) < 2) {
        return "";
    }
    $value = explode('"', $after[1], 2);
    return $value[0];
}

/**
 * Renders the stored Google response as one <div> per result: a link with the
 * form produced by urlpost(), then the title and the snippet.
 *
 * The response is cut up with string functions instead of json_decode().
 * NOTE(review): stored values may no longer be valid JSON if backslash
 * escapes were lost when they were inserted - confirm before switching.
 *
 * @param string $str Stored response text.
 * @param string $eik Business identifier the results belong to.
 * @return string HTML; empty string when the response has no items.
 */
function google_json($str, $eik)
{
    $r = "";

    $parts = explode('"items": [', (string) $str, 2);
    if (count($parts) < 2) {
        return $r; // error response or no results
    }

    // Element 0 is the text before the first title; results start at 1.
    $items = explode('"title": "', $parts[1]);
    $count = count($items);
    for ($i = 1; $i < $count; $i++) {
        $chunk = $items[$i];

        $titleParts = explode('",', $chunk, 2);
        $title = $titleParts[0];
        $link = google_json_value($chunk, '"link": "');
        $snippet = google_json_value($chunk, '"snippet": "');

        $safeLink = htmlspecialchars($link, ENT_QUOTES, 'UTF-8');
        $r .= "<div>";
        $r .= "<a href=\"" . $safeLink . "\" target=\"blank\">" . $safeLink . "</a>";
        $r .= urlpost($eik, $link);
        $r .= "<br/>" . htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
        $r .= "<br/>" . htmlspecialchars($snippet, ENT_QUOTES, 'UTF-8');
        $r .= "</div>";
    }
    return $r;
}
/**
 * Builds the small form with a "Това е" ("this is it") button that posts a
 * candidate URL as the final URL of a business back to the current page.
 *
 * @param string $eik Business identifier.
 * @param string $v   Candidate URL.
 * @return string Form HTML, or an empty string when $v is an empty string.
 */
function urlpost($eik, $v)
{
    if ($v === "") {
        return "";
    }

    // Everything placed inside an attribute is escaped: the values come from
    // search results and from the request URI.
    $action = htmlspecialchars((string) $_SERVER["REQUEST_URI"], ENT_QUOTES, 'UTF-8');
    $safeEik = htmlspecialchars((string) $eik, ENT_QUOTES, 'UTF-8');
    $safeUrl = htmlspecialchars((string) $v, ENT_QUOTES, 'UTF-8');

    return "<form method=\"post\" action=\"" . $action . "\">\n"
        . "<input type=\"hidden\" name=\"eik\" value=\"" . $safeEik . "\">\n"
        . "<input type=\"hidden\" name=\"url_final\" value=\"" . $safeUrl . "\">\n"
        . "<input type=\"submit\" value=\"Това е\">\n"
        . "</form>";
}
/**
 * Builds the form with an editable text field holding the final URL of a
 * business and a "Това е" ("this is it") button that posts it back to the
 * current page. Unlike urlpost(), the form is rendered for an empty value too.
 *
 * @param string      $eik Business identifier.
 * @param string|null $v   Current final URL; may be empty.
 * @return string Form HTML.
 */
function urlpost_final($eik, $v)
{
    // Everything placed inside an attribute is escaped: the values come from
    // the database and from the request URI.
    $action = htmlspecialchars((string) $_SERVER["REQUEST_URI"], ENT_QUOTES, 'UTF-8');
    $safeEik = htmlspecialchars((string) $eik, ENT_QUOTES, 'UTF-8');
    $safeUrl = htmlspecialchars((string) $v, ENT_QUOTES, 'UTF-8');

    return "<form method=\"post\" action=\"" . $action . "\">\n"
        . "<input type=\"hidden\" name=\"eik\" value=\"" . $safeEik . "\">\n"
        . "<input type=\"text\" name=\"url_final\" value=\"" . $safeUrl . "\" size=\"30\">\n"
        . "<input type=\"submit\" value=\"Това е\">\n"
        . "</form>";
}
// Close the MySQL connection opened before the table was rendered.
mysql_close($link);
?>
<br/>
<?php
// Previous/next links below the table. PHP_SELF is HTML-escaped, the offsets
// are forced to integers and the "previous" offset never goes below zero.
?>
<a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8')."?n=".max(0, (int) $n - (int) $l)."&amp;l=".(int) $l;?>">Предишна</a>
<a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8')."?n=".((int) $n + (int) $l)."&amp;l=".(int) $l;?>">Следваща</a>
</body>
</html>