Index de l'article

Script final

Voici un exemple de script complet, facilement adaptable à des sites de structure similaire (1 page = 1 enregistrement, et une URL suffixée par un identifiant numérique) :

<?php
/*
 * Scrapes contact pages whose URL is a fixed prefix followed by a numeric
 * identifier (one page = one record) and stores each record in MySQL.
 *
 * For every identifier from 0 to 10000 the script downloads the page, reads
 * the text of the last element matching each CSS selector, normalises it and
 * inserts one row into the `agencebio` table.
 */
include_once('simple_html_dom.php');

/**
 * Normalises a scraped text value before it is stored.
 *
 * Whitespace runs (line breaks, tabs, non-breaking spaces, repeated spaces)
 * are collapsed into a single space and the result is trimmed.
 *
 * @param mixed $text             Raw text; cast to string (null becomes '').
 * @param bool  $strip_separators When true, double quotes and semicolons are
 *                                replaced by spaces as well.
 * @return string The cleaned value.
 */
function clean_field($text, $strip_separators = true)
{
    $text = (string) $text;

    if ($strip_separators) {
        $text = str_replace(array('"', ';'), ' ', $text);
    }

    $collapsed = preg_replace('/[\s\x{00A0}]+/u', ' ', $text);
    if ($collapsed === null) {
        // preg_replace() returns null on invalid UTF-8: fall back to a
        // byte-wise replacement rather than losing the value.
        $collapsed = str_replace(array("\n", "\r", "\t"), ' ', $text);
    }

    return trim($collapsed);
}

/**
 * Returns the plain text of the last element matching a selector.
 *
 * @param object $html     Parsed document returned by file_get_html().
 * @param string $selector CSS selector understood by simple_html_dom.
 * @return string The element's plaintext, or '' when nothing matches.
 */
function last_match_text($html, $selector)
{
    $nodes = $html->find($selector);
    if (empty($nodes)) {
        return '';
    }

    $last = end($nodes);

    return (string) $last->plaintext;
}

// MySQL connection (failures are reported as exceptions)
$bdd = new PDO('mysql:host=localhost;dbname=extractor;charset=utf8', 'root', '', array(
    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
));

// URL prefix; the page identifier is appended to it
$content = 'http://site-indiscret.com/contacts/';

// Stream context that keeps the response body even on HTTP error statuses.
// It does not depend on the page, so it is built once.
$context = stream_context_create(array(
    'http' => array('ignore_errors' => true),
));

// The statement is prepared once and executed for every page
$req = $bdd->prepare("INSERT INTO agencebio (society, name, address, email1, category, phone)
VALUES (:society, :name, :address, :email, :category, :phone)");

// Loop over the pages: identifiers 0 to 10000 inclusive
for ($nb_lignes = 0; $nb_lignes <= 10000; $nb_lignes++) {
    // The page URL is a plain string: prefix + identifier
    $url = $content . $nb_lignes;

    // Download and parse the page. Fetching is best-effort: warnings are
    // silenced and an unreachable or unparsable page is simply skipped.
    $html = @file_get_html($url, false, $context);
    if (!$html) {
        continue;
    }

    // Extract and normalise each field ('' when the selector matches nothing)
    $society_  = mb_convert_case(clean_field(last_match_text($html, '.org')), MB_CASE_TITLE, 'UTF-8');
    $name_     = mb_convert_case(clean_field(last_match_text($html, '.fn')), MB_CASE_TITLE, 'UTF-8');
    $address_  = mb_convert_case(clean_field(last_match_text($html, '.street-address')), MB_CASE_TITLE, 'UTF-8');
    // E-mail addresses keep their quotes/semicolons and are lower-cased
    $email_    = mb_strtolower(clean_field(last_match_text($html, '.email'), false), 'UTF-8');
    $category_ = mb_convert_case(clean_field(last_match_text($html, '.grill-row .value')), MB_CASE_TITLE, 'UTF-8');
    $phone_    = clean_field(last_match_text($html, '.tel'));

    // Release the DOM tree before moving on to the next page
    $html->clear();
    unset($html);

    // A page with none of the expected elements (e.g. an error page) holds
    // no record: do not insert an empty row.
    if ($society_ === '' && $name_ === '' && $address_ === ''
        && $email_ === '' && $category_ === '' && $phone_ === '') {
        continue;
    }

    // Insert the record; one failed insert must not abort the whole run
    try {
        $req->execute(array(
            'society'  => $society_,
            'name'     => $name_,
            'address'  => $address_,
            'email'    => $email_,
            'category' => $category_,
            'phone'    => $phone_,
        ));
    } catch (PDOException $e) {
        error_log('Insert failed for ' . $url . ': ' . $e->getMessage());
    }
}

?>

Le script PHP intègre quelques corrections de champs ; là encore, adaptez-les à vos besoins : la gestion du HTML bien sûr (suppression des balises via plaintext), mais aussi celle des tabulations, des sauts de ligne, etc.

Et voilà ! Exécutez maintenant votre script (en vous rendant à son URL via un navigateur), il va aspirer les données balisées de chaque page testée dans vos champs BDD.

Modifiez la borne de la boucle while et le temps d'exécution maximum de PHP (max_execution_time) pour siphonner toutes les données qu'il vous faut.