From c9e59dfce20d10778d1455fb80f47d886a320cf3 Mon Sep 17 00:00:00 2001 From: Renan Bernordi Date: Tue, 24 Dec 2024 13:35:20 -0300 Subject: [PATCH] removido logs e liberado selenium como ultimo fallback --- TESTED_URLS.md | 2 ++ app/data/blocked_domains.php | 1 + app/inc/URLAnalyzer.php | 17 ++++++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/TESTED_URLS.md b/TESTED_URLS.md index 27b7096..543db23 100644 --- a/TESTED_URLS.md +++ b/TESTED_URLS.md @@ -39,6 +39,8 @@ https://oantagonista.com.br/brasil/lewandowski-insiste-na-pec-da-seguranca/ https://jornaldebrasilia.com.br/noticias/politica-e-poder/lula-aguarda-pt-para-troca-em-pastas-chefiadas-por-petistas-em-reforma-ministerial/ https://opopular.com.br/cidades/ex-secretario-de-saude-de-goiania-deixa-hospital-e-volta-para-a-cadeia-1.3207162 https://www.cartacapital.com.br/politica/surpresa-natalina/ +https://seucreditodigital.com.br/123milhas-devera-apresentar-plano-de-recuperacao-ainda-este-mes/ +https://www.matinaljornalismo.com.br/matinal/reportagem-matinal/vazao-guaiba-porto-alegre/ ## Internacional https://www.nytimes.com/2024/11/20/us/politics/matt-gaetz-venmo-payments-sex.html diff --git a/app/data/blocked_domains.php b/app/data/blocked_domains.php index 73ece58..3d0fcb4 100644 --- a/app/data/blocked_domains.php +++ b/app/data/blocked_domains.php @@ -16,6 +16,7 @@ 'utppublishing.com', 'chronicle.com', 'nexojornal.com', + 'nexojornal.com.br', 'lesoir.be', 'weeklytimesnow.com.au', 'barrons.com', diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php index 13c37c6..1166667 100644 --- a/app/inc/URLAnalyzer.php +++ b/app/inc/URLAnalyzer.php @@ -123,7 +123,6 @@ public function analyze($url) $host = preg_replace('/^www\./', '', $host); if (in_array($host, BLOCKED_DOMAINS)) { - Logger::getInstance()->log($cleanUrl, 'BLOCKED_DOMAIN'); throw new Exception('Este domínio está bloqueado para extração.'); } @@ -153,7 +152,7 @@ public function analyze($url) return $processedContent; } } catch (Exception $e) { - Logger::getInstance()->log($cleanUrl, 'DIRECT_FETCH_ERROR', $e->getMessage()); + error_log("DIRECT_FETCH_ERROR: " . $e->getMessage()); } // 6. Tenta buscar do Wayback Machine como fallback @@ -165,7 +164,19 @@ public function analyze($url) return $processedContent; } } catch (Exception $e) { - Logger::getInstance()->log($cleanUrl, 'WAYBACK_FETCH_ERROR', $e->getMessage()); + error_log("WAYBACK_FETCH_ERROR: " . $e->getMessage()); + } + + // 7. Tenta buscar com Selenium como fallback + try { + $content = $this->fetchFromSelenium($cleanUrl, 'firefox'); + if (!empty($content)) { + $processedContent = $this->processContent($content, $host, $cleanUrl); + $this->cache->set($cleanUrl, $processedContent); + return $processedContent; + } + } catch (Exception $e) { + error_log("SELENIUM_ERROR: " . $e->getMessage()); } Logger::getInstance()->log($cleanUrl, 'GENERAL_FETCH_ERROR');