Download multiple pdf files from website

Hi Team,

I am using Katalon for the 1st time. Please bear with me.

Can anyone point me to the steps for automatically downloading all PDF files from a website?

First, you should try searching this forum with the keyword "download file". You will find plenty of previous posts to read.

hello,

check this

ok,
here is code that shows how to download all .pdf links from the page

download the Jsoup .jar from
https://jsoup.org/download

and pdfBox .jar
https://pdfbox.apache.org/download.cgi

copy them to the Drivers folder of your Katalon project

TESTCASE:
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

// Collect every PDF link found on the page.
def basicUrl = 'http://www.testingdiaries.com/selenium-webdriver-read-pdf-content/'
List<String> linksArray = readAllPdfLinks(basicUrl)
println linksArray

// Fail with a clear message instead of an IndexOutOfBoundsException from get(0)
// when the page exposes no PDF links.
assert !linksArray.isEmpty() : "No .pdf links found on ${basicUrl}"

// Read the text of the first PDF (in this example there is only one .pdf link on the page).
String pdf = readPdfFile(linksArray.get(0))
println (pdf)

// Groovy's built-in assert is used here so the script does not depend on an
// Assert class, which the imports shown with this script do not provide.
assert pdf.contains('Open the setting.xml, you can see it is like this:')
assert pdf.contains('Please add the following sentence in setting.xml before')
assert pdf.contains('You can see that I have modified the setting.xml, and if open the file in IE, it is like this:')

/*
Fetches the page at the given url and collects the absolute URLs of every
anchor whose link contains ".pdf" (matched case-insensitively, so ".PDF" counts).

parameter basic url (address of the page to scan)
return list of .pdf links (empty when the page has none)
*/
public List<String> readAllPdfLinks(def url){
	
	List<String> pdfLinks = new ArrayList<>()
	
	Document doc = Jsoup.connect(url).get();
	print((doc.title()));

	Elements links = doc.select("a[href]");

	println("\nLinks: "+ links.size());
	for (Element link : links) {
		// Resolve the absolute URL once; absUrl("href") is Jsoup's equivalent of attr("abs:href").
		String href = link.absUrl("href");
		// Lower-case before matching so links ending in ".PDF" are not skipped.
		if (href.toLowerCase(Locale.ROOT).contains(".pdf")) {
			println("a: "+href);
			pdfLinks.add(href)
		}
	}
	return pdfLinks
	
}

/*
Opens the PDF at the given url, extracts its text with PDFTextStripper,
prints the text and returns it.

parameter pdf url
return pdf content (extracted text)

The stream and the document are closed in finally blocks so neither leaks
when loading or text extraction throws.
*/
public String readPdfFile(String pdfUrl){
	
	String pdfText;
	BufferedInputStream bis = new BufferedInputStream(new URL(pdfUrl).openStream());
	try {
		PDDocument doc = PDDocument.load(bis);
		try {
			pdfText = new PDFTextStripper().getText(doc);
		} finally {
			doc.close();
		}
	} finally {
		bis.close();
	}
	println(pdfText);
	return pdfText;
}
1 Like