<?php
// Fetch a UTF-8 encoded web page (here: the Tamil Wikipedia main page) and
// print the text of its <body> with all markup, scripts and styles removed.
$url = "http://ta.wikipedia.org/wiki/%E0%AE%AE%E0%AF%81%E0%AE%A4%E0%AE%B1%E0%AF%8D_%E0%AE%AA%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AE%AE%E0%AF%8D";

// file_get_contents() returns false on failure; stop here instead of handing
// a non-string (or an empty document) to the HTML parser.
$string = file_get_contents($url);
if ($string === false || $string === '') {
    throw new RuntimeException("Unable to fetch any content from {$url}");
}

// Turn every non-ASCII character into a numeric entity so the HTML parser
// does not have to guess the input encoding. This replaces
// mb_convert_encoding($string, 'HTML-ENTITIES', 'UTF-8'), whose
// 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2.
// Convmap layout: [first code point, last code point, offset, mask].
$string = mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

$dom = new DOMDocument();
$dom->preserveWhiteSpace = false;
$dom->encoding = 'UTF-8';

// Real-world pages are rarely valid for libxml; collect parse errors
// internally instead of emitting one PHP warning per markup problem, then
// restore whatever error mode was active before.
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($string);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

$cont = '';
$body = $dom->getElementsByTagName('body')->item(0);
if ($body !== null) {
    // Drop <script> and <style> elements in the DOM rather than with regular
    // expressions on the serialized page.
    foreach (['script', 'style'] as $tagName) {
        // getElementsByTagName() returns a live list, so snapshot it before
        // removing nodes; otherwise elements are skipped during iteration.
        foreach (iterator_to_array($body->getElementsByTagName($tagName)) as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    // Copy only the children of <body> into a fresh document so that the
    // serialized output carries no doctype, <html> or <head> wrapper.
    $mock = new DOMDocument();
    foreach ($body->childNodes as $child) {
        $mock->appendChild($mock->importNode($child, true));
    }

    // strip_tags() also removes tags and comments that span several lines,
    // which the previous "/<.*?>/" pattern (no "s" modifier) left behind.
    $cont = strip_tags((string) $mock->saveHTML());
}

echo $cont;
// The closing tag is kept on purpose: literal text follows it in this file.
?>
Related Tags: PHP crawl Hindi language HTML content, PHP crawl Tamil HTML content, PHP crawl UTF-8 content.
No comments:
Post a Comment