Author Topic: IMDB_[EN][HTTPS]_TEST_Aka (+Selenium) script  (Read 106 times)

0 Members and 3 Guests are viewing this topic.

Offline Ivek23

  • Global Moderator
  • *****
  • Posts: 2767
    • View Profile
IMDB_[EN][HTTPS]_TEST_Aka (+Selenium) script
« on: December 26, 2024, 05:47:16 pm »
Here is the IMDB_[EN][HTTPS]_TEST_Aka script, which uses a Python Selenium script to download all AKA (Also Known As) titles from the IMDb Release Info page.

Using a Python Selenium script, the complete web page with all Release Date and Also Known As (AKA) records is downloaded to the downpage-UTF8_NO_BOM.htm file, not just the maximum of 6 records that are visible when you open the IMDb Release Info page.

This IMDB_[EN][HTTPS]_TEST_Aka script is largely the same as any IMDB_[EN][HTTPS] script, but there are a few changes, which are listed below.

Here are the changes to the IMDB_[EN][HTTPS]_TEST_Aka script:

Quote
Function DownloadPage(URL:AnsiString):String; //BlockOpen
// Downloads the page at URL with the external Python/Selenium helper script and
// returns the page text converted with ConvertEncoding(..., 65001).
// Returns an empty string when:
//   - the user cancels after too many waits for the download file, or
//   - the downloaded text contains one of the known provider error markers
//     (404, page not found, 405, too many requests).
// The page is exchanged through the file ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM.
  Var
    i:Integer;                  // Number of waits done since the last retry prompt.
    ScriptPath,WebText:String;
    Begin
    LogMessage(Chr(9)+Chr(9)+'Function DownloadPage BEGIN======================|');
    LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');
    LogMessage(Chr(9)+Chr(9)+'      Local Var-URL|'+URL+' |');
    ScriptPath:=GetAppPath+'Scripts\';
    // Delete the previously downloaded page file, so that the existence of the
    // file can later be used as the "download finished" signal.
    While FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM) Do Begin
         LogMessage(Chr(9)+Chr(9)+'Deleting existing file: ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM);
         FileExecute('cmd.exe', '/C del "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');
         LogMessage(Chr(9)+Chr(9)+'      Waiting 1s for delete:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
         wait (1000);
    End;

    // Download the URL page.
    // Former downloader (kept for reference, replaced by the Selenium script):
    //LogMessage(Chr(9)+Chr(9)+'      Download with PVdBDownPage in file:|'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+' the information of:|'+URL+' ||');
    //FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');

    LogMessage(Chr(9) + Chr(9) + '      Download with Selenium in file:| ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + ' the information of:|' + URL + '||');
    LogMessage(Chr(9)+Chr(9)+'Executing Python script to download URL content.');
    FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py" "' + URL + '" "' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + '"');

    // Wait until the downloaded page file exists.
    // The wait is bounded: after INTERNET_TEST_ITERATIONS polls the user is asked
    // whether to retry or cancel. Without this, a failing python.exe / Selenium
    // run (which never creates the file) would hang the script forever.
    // NOTE(review): INTERNET_TEST_ITERATIONS and MessageBox are assumed to be
    // available as in the base IMDB_[EN][HTTPS] script - confirm.
    i:=0;
    While Not(FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM)) Do Begin
         LogMessage(Chr(9)+Chr(9)+'      Waiting 5s for exists of:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
         wait (5000);
         i:=i+1;
         If i=INTERNET_TEST_ITERATIONS Then Begin
            // The disabled original compared the MessageBox result with 2 to detect Cancel.
            if 2=MessageBox('Too many faulty attempts to internet connection.'+Chr(13)+ 'Retry or Cancel?',SCRIPT_NAME,5) then begin
               LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END with NOT INTERNET connection ===============|');
               Result:='';
               Exit;
            End;
            i:=0; // Retry chosen: start a new round of waits.
         End;
    End;

    LogMessage(Chr(9)+Chr(9)+'      Now present complete page file: '+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    WebText:=FileToString(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    LogMessage(Chr(9)+Chr(9)+'File content length: ' + IntToStr(Length(WebText)));
    LogMessage(Chr(9)+Chr(9)+'File content (first 100 chars): ' + Copy(WebText, 1, 100));
    WebText:=ConvertEncoding(WebText, 65001);
    Result:=WebText;

    // Some download data validations. Each check clears Result when the page
    // text contains a known error marker; the message box is only shown when
    // BYPASS_SILENT is set.
    if (Pos('404 Not Found',Result)>0) then begin
        If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Not Found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
        LogMessage(Chr(9)+Chr(9)+'      404 Not Found|');
        Result:='';
    End;

    if (Pos('404 Error - IMDb',Result)>0) then begin
        If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Error - IMDb).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
        LogMessage(Chr(9)+Chr(9)+'      404 Error - IMDb|');
        Result:='';
    End;

    if (Pos('Page not found',Result)>0) then begin
        If BYPASS_SILENT Then ShowMessage('The URL is not in use (Page not found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
        LogMessage(Chr(9)+Chr(9)+'      Page not found|');
        Result:='';
    End;

    if (Pos('405 Method not allowed',Result)>0) then begin
        If BYPASS_SILENT Then ShowMessage('The URL has HTTP method problems (405 Method not allowed).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
        LogMessage(Chr(9)+Chr(9)+'      405 Method not allowed|');
        Result:='';
    End;
    if (Pos('Too many request',Result)>0) then begin
        If BYPASS_SILENT Then ShowMessage('The provider has banned your IP (Too many request).'+Chr(13)+'Go to the provider web and resolve the captcha in order to prove you are not a robot',SCRIPT_NAME);
        LogMessage(Chr(9)+Chr(9)+'      Banned IP|');
        Result:='';
    End;

    LogMessage('Value BASE_DOWNLOAD_FILE_NO_BOM: ' + BASE_DOWNLOAD_FILE_NO_BOM);
    LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END======================|');
    exit;
  End; //BlockClose

Quote
Function DownloadImage(URL:AnsiString;OutPutFile:AnsiString):Integer; //BlockOpen
// Image download is DISABLED in this TEST_Aka variant of the script: the whole
// former implementation is kept below inside a comment for reference only.
// The function therefore downloads nothing and always returns 0
// (the original contract: 1 if the downloaded image file exists, 0 otherwise).
    //Var
    //i:Integer;
    //ScriptPath:String;
    Begin
    // Assign the result explicitly: with the body commented out the function
    // previously ended without ever setting Result.
    Result:=0;
   (*
    LogMessage(Chr(9)+Chr(9)+'Function DownloadImage BEGIN======================|');
    LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');   
    LogMessage(Chr(9)+Chr(9)+'      Local Var-URL|'+URL+' |');
    LogMessage(Chr(9)+Chr(9)+'      Local Var-OutPutFile|'+OutPutFile+'|');
    ScriptPath:=GetAppPath+'Scripts\';
    //Delete the ancient dowloaded page file. Needed for wait to curl download included in PowerShell command.
    While FileExists(OutPutFile) Do Begin
         FileExecute('cmd.exe', '/C del "'+OutPutFile+'"');
         LogMessage(Chr(9)+Chr(9)+'      Waiting 1s for delete:'+OutPutFile);
         wait (1000);
    End;
    //Download the URL page.
    LogMessage(Chr(9)+Chr(9)+'      Download with PVdBDownPage in file:|'+OutPutFile+' the information of:|'+URL+' ||');
    FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+OutPutFile+'"');
    //Wait download finish and exist the downloaded page.
    i:=0;   // INTERNET_TEST_ITERATIONS
    While Not(FileExists(OutPutFile)) Do Begin
        LogMessage(Chr(9)+Chr(9)+'      Waiting 2s for exists of:'+OutPutFile);
        wait (2000);
        i:=i+1;
        If i=INTERNET_TEST_ITERATIONS Then Begin  //In the images download the scritp can not ask to the user for internet conexion because perhaps the file doesn't exist.
            LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END with NOT file downloaded ===============|');
            Result:=0;
            exit;
         End;
    End;
    LogMessage(Chr(9)+Chr(9)+'      Now present complete page file: '+OutPutFile);
    Result:=1;
    LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END======================|');
    exit;
    *)
  End; //BlockClose

Quote
//(*
// Applies a list of replacements to Source, one pair after another.
//   Source     - text to process.
//   OldPattern - substrings to search for.
//   NewPattern - replacement for the OldPattern entry at the same index.
// Returns Source after replacing OldPattern[i] with NewPattern[i] for every
// index present in BOTH arrays; the replacements are applied sequentially, so
// a later pair sees the output of the earlier ones.
function CustomStringReplace(const Source: string; const OldPattern: array of string; const NewPattern: array of string): string;
var
  i, LastIndex: Integer;
  ResultString: string;
begin
  ResultString := Source;
  // Only iterate over indexes that exist in both arrays, so a shorter
  // NewPattern can never be read out of range.
  LastIndex := High(OldPattern);
  if High(NewPattern) < LastIndex then
    LastIndex := High(NewPattern);
  for i := Low(OldPattern) to LastIndex do
  begin
    // Pass the i-th element of each array (the previous code passed the whole
    // arrays). The three boolean flags are kept exactly as in the original call.
    ResultString := StringReplace(ResultString, OldPattern[i], NewPattern[i], True, False, True);
  end;
  Result := ResultString;
end;
//*)

For best performance, only Function ParsePage_IMDBMovieAKA is used for parsing.

Also, for best performance, the end of Function ParsePage looks like this:

Quote
        //Get ~url~
        if (0=Pos(BASE_URL_PRE,StoredURL)) then begin   //Write the url if not exists
            AddFieldValueXML('url',StringReplace(DownloadURL,BASE_URL_PRE_TRUE,BASE_URL_PRE,True,False,False));
            LogMessage('      Get result url:'+StringReplace(DownloadURL,BASE_URL_PRE_TRUE,BASE_URL_PRE,True,False,False)+' ||');
        end;   

    //Parse Also Known As provider page = BASE_URL_AKA-------------------------------------------------------------------
        If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin   
        //If (GET_FULL_AKA and (MediaType='Movie') and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
        //If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin      
            DownloadURL:=StringReplace(BASE_URL_AKA,'%IMDB_ID',MovieID,True,True,False);
            HTML:=DownloadPage(DownloadURL);  //True page for parsing
         LogMessage('Length of the read HTML: ' + IntToStr(Length(HTML)));
            HTML:=HTMLToText(HTML);
            ResultTmp:=ParsePage_IMDBMovieAKA(HTML);
            If Not(ResultTmp=prFinished) then Result:=ResultTmp;
        End;
//*)         
    //Date ~Updated~ (choose simple or verbose version)
        Date:=DateToStr(CurrentDateTime);
        ExplodeString(Date,DateParts,'-');
        Date:=DateParts[2]+'.'+ DateParts[1]+'.'+DateParts[0];
      Date := CustomStringReplace(Date, ['01.', '02.', '03.', '04.', '05.', '06.', '07.', '08.', '09.'], ['1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.']);
        AddCustomFieldValueByName('Updated',Date); // Simple
        //AddCustomFieldValueByName('Updated0',Date+' at '+TimeToStr(CurrentDateTime)); // Verbose
        //AddCustomFieldValueByName('IUpdated',Date+' at '+TimeToStr(CurrentDateTime)+' • '+SCRIPT_NAME+' '+SCRIPT_VERSION);  // Annoying 
        LogMessage('    Provider data info retreived Ok in '+DateToStr(CurrentDateTime)+' '+TimeToStr(CurrentDateTime)+'| (~Updated~)');
        Mode:=smFinished;
        LogMessage('Function ParsePage smNormal END======================|');   
        Exit;      
    End;   
//Parse with the Person URL 'smFinished'------------------------------------------------------------------------------------
    If (Mode=smFinished) Then Begin //Needed because the PHOTO_DWN_RONDABOUT
        Mode:=smFinished;
        LogMessage('Function ParsePage smFinished END======================|');
      LogMessage('Function ParsePage END======================| Mode: ' + IntToStr(Mode));
        Result:=prFinished;   
        Exit;    
    End;
//Parse with unknow mode-----------------------------------------------------------------------------------------------   
    Result:=prError;
    Exit; 
End; //BlockClose
//OBLIGATORY CALLBACK PRINCIPAL FUNCTION=========================================================================BlockClose

More instructions for how to run the script can be found at the link below.

http://www.videodb.info/forum_en/index.php/topic,4362.0.html

Python Selenium script

The Python Selenium script, which is used together with the IMDB_[EN][HTTPS]_TEST_Aka script, is attached here. Unzip it and add it to the Scripts folder of the program. Instructions for use are in the first post above.

Python Selenium script is added.

Python Selenium script is at the link below.

http://www.videodb.info/forum_en/index.php/topic,4362.msg22691.html#msg22691
« Last Edit: January 06, 2025, 06:48:37 pm by Ivek23 »
Ivek23
Win 10 64bit (32bit)   PVD v0.9.9.21, PVD v1.0.2.7, PVD v1.0.2.7 + MOD


 

anything