In this particular case , i have used java programming ,maven as build tools and HtmlUnit library to scrap some data form hamrobazar.com and Apache poi library write to excel files.
collect the all the categories name , sub-categories and grab all the items details of basepage suppose(name,date price,seller name...etc). after collecting all the data keep the data one array-List ,write to excel files as categories name should be excel file name ,sub- categories should be excel sheet name at last put the items details in each sheet .
example:
excel data structure:
Item name address date description imageurl status sellerName imageDirecotry
solution =>
package Scrapper;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import model.Category;
import model.Item;
import model.SubCategory;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class Scrapper1 {
public static List<Category> categories = new ArrayList<Category>();
public static String BaseUrl = "https://hamrobazaar.com/";
public static void allCategories() {
WebClient webClient = new WebClient();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
try {
HtmlPage page = webClient.getPage(BaseUrl);
HtmlTable htmlTable = (HtmlTable) page.getByXPath("//*[@id=\"tab_cat1\"]/table").get(0);
for (final HtmlTableRow row : htmlTable.getRows()) {
for (final HtmlTableCell cell : row.getCells()) {
HtmlAnchor htmlAnchor = (HtmlAnchor) cell.getFirstChild().getFirstChild();
Category category = new Category();
category.setCategoryName(htmlAnchor.getTextContent());
category.setSubCategories(grabAllSubCategories(htmlAnchor.getHrefAttribute()));
categories.add(category);
System.out.println(categories);
}
}
//anchor.getTextContent();
} catch (Exception e) {
System.out.println(e);
}
}
private static List<SubCategory> grabAllSubCategories(String categoryHref) {
List<SubCategory> subCategories = new ArrayList<SubCategory>();
WebClient webClient = new WebClient();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
try {
HtmlPage page1 = webClient.getPage(BaseUrl + categoryHref);
try {
HtmlTable htmlTable = (HtmlTable) page1.getByXPath("//*[@id=\"tab_cat1\"]/table").get(0);
for (final HtmlTableRow row : htmlTable.getRows()) {
for (final HtmlTableCell cell : row.getCells()) {
HtmlAnchor htmlAnchor = (HtmlAnchor) cell.getFirstChild().getFirstChild();
String categories = htmlAnchor.getTextContent();
SubCategory subCategory = new SubCategory();
subCategory.setName(htmlAnchor.getTextContent());
subCategory.setItemList(grabAllItems(htmlAnchor.getHrefAttribute()));
subCategories.add(subCategory);
}
}
} catch (Exception e) {
e.printStackTrace();
}
} catch (Exception e) {
System.out.println(e);
}
return subCategories;
}
private static List<Item> grabAllItems(String subhrefAttribute) {
List<Item> items = new ArrayList<Item>();
WebClient webClient = new WebClient();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
try {
HtmlPage page1 = webClient.getPage(BaseUrl + subhrefAttribute);
HtmlTableCell htmlTableCell = (HtmlTableCell) page1.getByXPath("/html/body/table/tbody/tr[2]/td/table/tbody/tr[1]/td/table[5]/tbody/tr/td[2]").get(0);
try {
for (DomElement domElement : htmlTableCell.getChildElements()) {
try {
for (HtmlElement htmlElement : domElement.getHtmlElementDescendants()) {
List<HtmlTableCell> htmlTableCells = ((HtmlTableBody) htmlElement).getRows().get(0).getCells();
HtmlTableCell imageCell = htmlTableCells.get(1);
HtmlTableCell detailsCell = htmlTableCells.get(2);
HtmlTableCell dateCell = htmlTableCells.get(3);
HtmlTableCell priceCell = htmlTableCells.get(4);
String itemName = ((HtmlAnchor) detailsCell.getHtmlElementsByTagName("a").get(0)).getTextContent();
String sellerName = ((HtmlAnchor) detailsCell.getHtmlElementsByTagName("a").get(1)).getTextContent();
String imageUrl = ((HtmlImage) imageCell.getHtmlElementsByTagName("img").get(0)).getSrcAttribute();
String price = ((HtmlElement) priceCell.getHtmlElementsByTagName("b").get(0)).getTextContent();
String status = priceCell.getTextContent().substring(10, priceCell.getTextContent().indexOf(")"));
String description = detailsCell.getTextContent()
.substring(0, detailsCell.getTextContent().indexOf("Seller"))
.replace(itemName, "");
Item item = new Item();
item.setDescription(description);
item.setImageUrl(imageUrl);
item.setItemName(itemName);
item.setStatus(status);
// item.setAddress(address);
item.setDate(dateCell.getTextContent());
item.setSellerName(sellerName);
item.setItemPrice(price);
items.add(item);
System.out.println(items);
}
} catch (Exception ignored) {
}
}
} catch (Exception e) {
System.out.println(e);
}
} catch (Exception e) {
System.out.println(e);
}
return items;
}
//wrtting to excel files
public static void writeToXls() {
try {
String excelDir = "C:/Users/tej ghising/Desktop/office/";
for (int a = 0; a < categories.size(); a++) {
XSSFWorkbook wb = new XSSFWorkbook();
System.out.println(categories.get(a).getCategoryName());
for (int b = 0; b < categories.get(a).getSubCategories().size(); b++) {
Sheet sheet = wb.createSheet(categories.get(a).getSubCategories().get(b).getName());
Row rowHeading = sheet.createRow(b);
rowHeading.createCell(0).setCellValue("Item name");
rowHeading.createCell(1).setCellValue("address");
rowHeading.createCell(2).setCellValue("date");
rowHeading.createCell(3).setCellValue("description");
rowHeading.createCell(4).setCellValue("imageurl");
rowHeading.createCell(5).setCellValue("status");
rowHeading.createCell(6).setCellValue("sellerName");
rowHeading.createCell(7).setCellValue("imageDirecotry");
sheet.autoSizeColumn(b);
System.out.println(categories.get(a).getSubCategories().get(b).getName());
for (int i = 1; i < categories.get(a).getSubCategories().get(b).getItemList().size(); i++) {
Item obj = categories.get(a).getSubCategories().get(b).getItemList().get(i);
Row row = sheet.createRow(i);
Cell cell1 = row.createCell(0);
cell1.setCellValue(obj.getItemName());
Cell cell2 = row.createCell(1);
cell2.setCellValue(obj.getAddress());
Cell cell3 = row.createCell(2);
cell3.setCellValue(obj.getDate());
Cell cell4 = row.createCell(3);
cell4.setCellValue(obj.getDescription());
Cell cell5 = row.createCell(4);
cell5.setCellValue(obj.getImageUrl());
Cell cell6 = row.createCell(5);
cell6.setCellValue(obj.getStatus());
Cell cell7 = row.createCell(6);
cell7.setCellValue(obj.getSellerName());
}
}
FileOutputStream outputStream = new FileOutputStream(new File(excelDir + categories.get(a).getCategoryName() + ".xlsx"));
wb.write(outputStream);
}
} catch (Exception e) {
System.out.println(e);
}
}
public static void main(String[] args) throws IOException {
//calling methods
allCategories();
writeToXls();
}
}
0 Comments