Here is the IMDB_[EN][HTTPS]_TEST_Aka script, which uses a Python Selenium script to download all AKA titles from the IMDb Release Info page.
Using the Python Selenium script, the complete web page with all Release Date and Also Known As (AKA) records is downloaded to the downpage-UTF8_NO_BOM.htm file, not just the 6 records (at most) that are visible when you first open the IMDb Release Info page.
This IMDB_[EN][HTTPS]_TEST_Aka script is largely the same as any IMDB_[EN][HTTPS] script, but there are a few changes, which are listed below.
Here are the changes to the IMDB_[EN][HTTPS]_TEST_Aka script:
Function DownloadPage(URL:AnsiString):String; //BlockOpen
//Downloads the page at URL by running the external Python script 'selenium_script.py'
//(located in the application's 'Scripts\' folder), which writes the page into the file
//BASE_DOWNLOAD_FILE_NO_BOM in that same folder. The file is then read back and returned.
//Parameters:
//  URL - address of the page to download.
//Returns:
//  The downloaded page text, or an empty string when the text contains one of the
//  known error markers checked below (404, 405, "Page not found", "Too many request").
Var
  i:Integer;
  ScriptPath,WebText:String;
Begin
  LogMessage(Chr(9)+Chr(9)+'Function DownloadPage BEGIN======================|');
  LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');
  LogMessage(Chr(9)+Chr(9)+' Local Var-URL|'+URL+' |');
  ScriptPath:=GetAppPath+'Scripts\';

  //Delete the previously downloaded page file. The existence of this file is used below
  //as the "download finished" signal, so a stale copy must be gone before starting.
  While FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM) Do Begin
    LogMessage(Chr(9)+Chr(9)+'Deleting existing file: ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM);
    FileExecute('cmd.exe', '/C del "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');
    LogMessage(Chr(9)+Chr(9)+' Waiting 1s for delete:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    wait (1000);
  End;

  //Download the URL page.
  //Previous downloader (PVdBDownPage.exe), kept for reference:
  //LogMessage(Chr(9)+Chr(9)+' Download with PVdBDownPage in file:|'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+' the information of:|'+URL+' ||');
  //FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');
  LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:| ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + ' the information of:|' + URL + '||');
  LogMessage(Chr(9)+Chr(9)+'Executing Python script to download URL content.');
  //The Python script receives two quoted arguments: the URL and the output file path.
  FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py" "' + URL + '" "' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + '"');

  //Wait until the download has finished, i.e. until the output file exists.
  //NOTE(review): with the retry/cancel block below disabled this loop has no upper bound;
  //if python.exe or the Selenium script fails to produce the file, it never ends.
  i:=0; // INTERNET_TEST_ITERATIONS
  While Not(FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM)) Do Begin
    //The log text now matches the real 5000 ms polling interval (it used to say 2s).
    LogMessage(Chr(9)+Chr(9)+' Waiting 5s for exists of:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    wait (5000);
    (*
    i:=i+1;
    If i=INTERNET_TEST_ITERATIONS Then Begin
    if 2=MessageBox('Too many faulty attempts to internet connection.'+Chr(13)+ 'Retry or Cancel?',SCRIPT_NAME,5) then begin
    LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END with NOT INTERNET connection ===============|');
    Result:='';
    Exit;
    End;
    i:=0;
    End;
    *)
  End;

  LogMessage(Chr(9)+Chr(9)+' Now present complete page file: '+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
  WebText:=FileToString(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
  LogMessage(Chr(9)+Chr(9)+'File content length: ' + IntToStr(Length(WebText)));
  LogMessage(Chr(9)+Chr(9)+'File content (first 100 chars): ' + Copy(WebText, 1, 100));
  //65001 is the Windows code page id for UTF-8.
  //NOTE(review): presumably this converts the UTF-8 file content to the program's string encoding - confirm.
  WebText:=ConvertEncoding(WebText, 65001);
  Result:=WebText;

  //Some download data validations. Each hit clears Result, so once one marker has
  //matched the remaining checks search an empty string and cannot match again.
  if (Pos('404 Not Found',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Not Found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 404 Not Found|');
    Result:='';
  End;
  if (Pos('404 Error - IMDb',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Error - IMDb).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 404 Error - IMDb|');
    Result:='';
  End;
  if (Pos('Page not found',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (Page not found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' Page not found|');
    Result:='';
  End;
  if (Pos('405 Method not allowed',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL has HTTP method problems (405 Method not allowed).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 405 Method not allowed|');
    Result:='';
  End;
  if (Pos('Too many request',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The provider has banned your IP (Too many request).'+Chr(13)+'Go to the provider web and resolve the captcha in order to prove you are not a robot',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' Banned IP|');
    Result:='';
  End;

  LogMessage('Value BASE_DOWNLOAD_FILE_NO_BOM: ' + BASE_DOWNLOAD_FILE_NO_BOM);
  LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END======================|');
  exit;
End; //BlockClose
Function DownloadImage(URL:AnsiString;OutPutFile:AnsiString):Integer; //BlockOpen
//Image download is disabled in this test script: the original implementation is kept
//below inside a comment block for reference and nothing is downloaded.
//Parameters:
//  URL        - address of the image (unused while the body is disabled).
//  OutPutFile - target file path (unused while the body is disabled).
//Returns:
//  Always 0 ("no image file downloaded"). The original contract was 1 when the
//  downloaded image file exists on exit and 0 otherwise.
//Var
//i:Integer;
//ScriptPath:String;
Begin
  //Assign the result explicitly; previously the function ended without setting Result.
  Result:=0;
  (*
  LogMessage(Chr(9)+Chr(9)+'Function DownloadImage BEGIN======================|');
  LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');
  LogMessage(Chr(9)+Chr(9)+' Local Var-URL|'+URL+' |');
  LogMessage(Chr(9)+Chr(9)+' Local Var-OutPutFile|'+OutPutFile+'|');
  ScriptPath:=GetAppPath+'Scripts\';
  //Delete the ancient dowloaded page file. Needed for wait to curl download included in PowerShell command.
  While FileExists(OutPutFile) Do Begin
  FileExecute('cmd.exe', '/C del "'+OutPutFile+'"');
  LogMessage(Chr(9)+Chr(9)+' Waiting 1s for delete:'+OutPutFile);
  wait (1000);
  End;
  //Download the URL page.
  LogMessage(Chr(9)+Chr(9)+' Download with PVdBDownPage in file:|'+OutPutFile+' the information of:|'+URL+' ||');
  FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+OutPutFile+'"');
  //Wait download finish and exist the downloaded page.
  i:=0; // INTERNET_TEST_ITERATIONS
  While Not(FileExists(OutPutFile)) Do Begin
  LogMessage(Chr(9)+Chr(9)+' Waiting 2s for exists of:'+OutPutFile);
  wait (2000);
  i:=i+1;
  If i=INTERNET_TEST_ITERATIONS Then Begin //In the images download the scritp can not ask to the user for internet conexion because perhaps the file doesn't exist.
  LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END with NOT file downloaded ===============|');
  Result:=0;
  exit;
  End;
  End;
  LogMessage(Chr(9)+Chr(9)+' Now present complete page file: '+OutPutFile);
  Result:=1;
  LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END======================|');
  exit;
  *)
End; //BlockClose
//(*
function CustomStringReplace(const Source: string; const OldPattern: array of string; const NewPattern: array of string): string;
// Applies a list of replacements to Source, one pair at a time and in array order:
// OldPattern[i] is replaced by NewPattern[i]. Each replacement works on the result of
// the previous one.
// Parameters:
//   Source     - text to process.
//   OldPattern - texts to search for.
//   NewPattern - replacement texts; NewPattern[i] pairs with OldPattern[i].
// Returns:
//   Source with all paired replacements applied. Entries of OldPattern that have no
//   counterpart in NewPattern are skipped.
var
  i: Integer;
  ResultString: string;
begin
  ResultString := Source;
  for i := Low(OldPattern) to High(OldPattern) do
  begin
    // Guard against a NewPattern array that is shorter than OldPattern.
    if i <= High(NewPattern) then
    begin
      // Pass the i-th pair, not the whole arrays. The three boolean flags are kept
      // exactly as before (presumably ReplaceAll, IgnoreCase, WholeWord - confirm
      // against the PVD script reference).
      ResultString := StringReplace(ResultString, OldPattern[i], NewPattern[i], True, False, True);
    end;
  end;
  Result := ResultString;
end;
//*)
For best performance, only Function ParsePage_IMDBMovieAKA is used for parsing.
Likewise, for best performance, the end of Function ParsePage looks like this:
//Get ~url~
if (0=Pos(BASE_URL_PRE,StoredURL)) then begin //Write the url if not exists
AddFieldValueXML('url',StringReplace(DownloadURL,BASE_URL_PRE_TRUE,BASE_URL_PRE,True,False,False));
LogMessage(' Get result url:'+StringReplace(DownloadURL,BASE_URL_PRE_TRUE,BASE_URL_PRE,True,False,False)+' ||');
end;
//Parse Also Known As provider page = BASE_URL_AKA-------------------------------------------------------------------
If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
//If (GET_FULL_AKA and (MediaType='Movie') and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
//If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
DownloadURL:=StringReplace(BASE_URL_AKA,'%IMDB_ID',MovieID,True,True,False);
HTML:=DownloadPage(DownloadURL); //True page for parsing
LogMessage('Length of the read HTML: ' + IntToStr(Length(HTML)));
HTML:=HTMLToText(HTML);
ResultTmp:=ParsePage_IMDBMovieAKA(HTML);
If Not(ResultTmp=prFinished) then Result:=ResultTmp;
End;
//*)
//Date ~Updated~ (choose simple or verbose version)
Date:=DateToStr(CurrentDateTime);
ExplodeString(Date,DateParts,'-');
Date:=DateParts[2]+'.'+ DateParts[1]+'.'+DateParts[0];
Date := CustomStringReplace(Date, ['01.', '02.', '03.', '04.', '05.', '06.', '07.', '08.', '09.'], ['1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.']);
AddCustomFieldValueByName('Updated',Date); // Simple
//AddCustomFieldValueByName('Updated0',Date+' at '+TimeToStr(CurrentDateTime)); // Verbose
//AddCustomFieldValueByName('IUpdated',Date+' at '+TimeToStr(CurrentDateTime)+' • '+SCRIPT_NAME+' '+SCRIPT_VERSION); // Annoying
LogMessage(' Provider data info retreived Ok in '+DateToStr(CurrentDateTime)+' '+TimeToStr(CurrentDateTime)+'| (~Updated~)');
Mode:=smFinished;
LogMessage('Function ParsePage smNormal END======================|');
Exit;
End;
//Parse with the Person URL 'smFinished'------------------------------------------------------------------------------------
If (Mode=smFinished) Then Begin //Needed because the PHOTO_DWN_RONDABOUT
Mode:=smFinished;
LogMessage('Function ParsePage smFinished END======================|');
LogMessage('Function ParsePage END======================| Mode: ' + IntToStr(Mode));
Result:=prFinished;
Exit;
End;
//Parse with unknow mode-----------------------------------------------------------------------------------------------
Result:=prError;
Exit;
End; //BlockClose
//OBLIGATORY CALLBACK PRINCIPAL FUNCTION=========================================================================BlockClose
More instructions for how to run the script can be found at the link below.
http://www.videodb.info/forum_en/index.php/topic,4362.0.html
Python Selenium script
The Python Selenium script, which is used together with the IMDB_[EN][HTTPS]_TEST_Aka script, is attached here. Unzip it and add it to the Scripts folder of the program. Instructions for use are in the first post above.
Python Selenium script is added.
Python Selenium script is at the link below.
http://www.videodb.info/forum_en/index.php/topic,4362.msg22691.html#msg22691