728x90
Selenium
이번 프로젝트에서 기존 사이트의 게시판 글을 모두 마이그레이션해 달라는 요청이 있었다.
처음에는 DB로 데이터를 받을 수 있는지 협의해 봤는데 DB는 줄 수 없다는 답변을 받았다.
그래서 크롤링으로 모두 마이그레이션 하려고 Selenium을 이용하였다.
Selenium은 실제 브라우저와 연동되어 크롤링하지 못하는 것이 없는 강력한 소프트웨어이다.
C#, Ruby, Java, Python, JavaScript를 지원한다.
난 여기서 내가 쉽게 할 수 있는 Java를 택했다.
소스를 코딩한 뒤에 실행하면 설정한 브라우저(크롬)가 하나 실행되고 그 브라우저를 바탕으로 크롤링이 된다.
해당 소스를 코딩한 뒤에 마이그레이션을 빌미로 잘 놀았던 것 같다. ^^
아래는 내가 구현한 소스이며 더 디테일하게 구현 가능하나 간단하게 필요한 부분만 긁어오게 구현하고 프로그램을 돌렸다.
일회성이라 특별한 기능은 없다.
package board;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.dataloader.Try;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openqa.selenium.By;
import org.openqa.selenium.Pdf;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.print.PrintOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import board.service.MyBatisService;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.file.Paths;
import io.opentelemetry.exporter.logging.SystemOutLogRecordExporter;
/**
 * One-off crawler used to migrate the posts of an existing board site into temporary tables.
 *
 * <p>The class drives a real Chrome browser through Selenium, opens every post linked from the
 * board's list pages, reads title, body, author, creation date, attachments and inline images
 * from the rendered page, and hands each record to {@code MyBatisService}.
 *
 * <p>The target URLs and the {@code gubun}/{@code category} labels are placeholders that have to
 * be edited before a run. The XPath and class-name locators are specific to the crawled site.
 */
public class SeleniumTest {

    /** System property key that the constructor sets to {@link #WEB_DRIVER_PATH}. */
    public static final String WEB_DRIVER_ID = "webdriver.chrome.driver";

    /** Location of the ChromeDriver executable on the machine that runs the migration. */
    public static final String WEB_DRIVER_PATH = "D:\\mig\\chromedriver_win32\\chromedriver.exe";

    /** Container of the post body on a detail page; the inline images are looked up below it. */
    private static final String POST_BODY_XPATH =
            "//*[@id=\"part1\"]/div[3]/div[2]/div[1]/div/div/div";

    /** Browser session shared by all crawl methods. */
    private final WebDriver driver;

    /** First page opened by every crawl method. */
    private final String base_url;

    /**
     * Prints the rows returned by {@code MyBatisService.select()}, then runs {@link #crawl2()}.
     *
     * <p>The browser is closed in a {@code finally} block so that neither Chrome nor the
     * chromedriver process is left running when the crawl ends or fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SeleniumTest selTest = null;
        try {
            MyBatisService service = new MyBatisService();
            List<Map<String, Object>> list = service.select();
            for (Map<String, Object> dept : list) {
                System.out.println(dept);
            }

            selTest = new SeleniumTest();
            // Switch to selTest.crawl() to run the board migration instead.
            selTest.crawl2();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (selTest != null) {
                selTest.close();
            }
        }
    }

    /**
     * Registers the ChromeDriver binary and starts a Chrome session with default options.
     *
     * <p>The browser runs with a visible window; add the {@code headless} argument to the
     * {@code ChromeOptions} below to run without one.
     */
    public SeleniumTest() {
        System.setProperty(WEB_DRIVER_ID, WEB_DRIVER_PATH);

        ChromeOptions options = new ChromeOptions();
        driver = new ChromeDriver(options);

        base_url = "크롤링할 기본 url";
    }

    /**
     * Ends the browser session started by the constructor.
     *
     * <p>NOTE(review): downloads triggered by {@link #crawl2()} that are still in flight when
     * this is called may be cut off; lengthen the pause after each click if that happens.
     */
    public void close() {
        driver.quit();
    }

    /**
     * Opens the base URL, waits, then loads a local HTML file and clicks every anchor that is a
     * direct child of {@code <body>}, one per second, printing a running index.
     *
     * <p>Any exception is printed and ends the method; nothing is rethrown.
     *
     * @throws IOException never thrown by the current implementation; kept for existing callers
     */
    public void crawl2() throws IOException {
        try {
            // Same as typing the URL into the address bar and requesting it.
            driver.get(base_url);
            // Presumably leaves time for a manual login in the opened browser; confirm.
            pause(30000);

            driver.get("file:///C:/Users/ksmro/Desktop/img.html");
            List<WebElement> anchors = driver.findElements(By.xpath("/html/body/a"));
            int i = 0;
            for (WebElement anchor : anchors) {
                System.out.println(i++);
                anchor.click();
                pause(1000);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks the board's list pages from {@code start} to {@code end}, collects the post links on
     * each page and migrates every linked post.
     *
     * <p>The walk stops at the first list page without any {@code ms-itmhover} element. Any
     * exception is printed and aborts the remaining work; nothing is rethrown.
     *
     * @throws IOException never thrown by the current implementation; kept for existing callers
     */
    public void crawl() throws IOException {
        try {
            // Same as typing the URL into the address bar and requesting it.
            driver.get(base_url);
            pause(10000);

            int start = 1;
            int end = 1;
            String gubun = "구분";
            String category = "카테고리";
            String url = "크로링할 URL";

            for (int page = start; page <= end; page++) {
                driver.get(url + page);

                List<WebElement> rows = driver.findElements(By.className("ms-itmhover"));
                System.out.println(rows.size());
                if (rows.isEmpty()) {
                    break;
                }

                // Links are collected first because navigating to a post invalidates the rows.
                for (Map<String, Object> post : collectPostLinks(rows, category)) {
                    migratePost(post, gubun);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds one entry per list row that contains a {@code ui-anchor} link.
     *
     * @param rows list-page elements, one per post
     * @param category label stored with every entry
     * @return maps with the keys {@code main_img} (always {@code null}), {@code link} and
     *     {@code category}; rows without a link are skipped
     */
    private List<Map<String, Object>> collectPostLinks(List<WebElement> rows, String category) {
        List<Map<String, Object>> posts = new ArrayList<>();
        for (WebElement row : rows) {
            System.out.println(row.getAttribute("innerHTML"));

            String link = null;
            try {
                link = row.findElement(By.className("ui-anchor")).getAttribute("href");
            } catch (Exception e) {
                // A row without the anchor is reported and skipped, not treated as fatal.
                e.printStackTrace();
            }
            System.out.println("link : " + link);

            if (link != null) {
                System.out.println(link);
                Map<String, Object> post = new HashMap<>();
                post.put("main_img", null);
                post.put("link", link);
                post.put("category", category);
                posts.add(post);
            }
        }
        return posts;
    }

    /**
     * Opens one post, reads its fields from the page and stores the post, its attachments and
     * its inline images.
     *
     * @param post entry produced by {@link #collectPostLinks(List, String)}
     * @param gubun label stored with the post
     * @throws Exception if a required element is missing or the service call fails
     */
    private void migratePost(Map<String, Object> post, String gubun) throws Exception {
        String urlStr = (String) post.get("link");
        String main_img = (String) post.get("main_img");
        String category = (String) post.get("category");

        // No identifier is derived from the URL yet, so the post is stored without one.
        String uuid = null;
        System.out.println("id : " + uuid);

        driver.get(urlStr);

        WebElement webElementTitle =
                driver.findElement(By.xpath("//*[@id=\"part1\"]/div[3]/div[1]/div/p"));
        String title = webElementTitle.getText();
        System.out.println("제목 : " + title);

        String body;
        try {
            body = driver.findElement(By.xpath(POST_BODY_XPATH)).getAttribute("innerHTML");
        } catch (Exception e) {
            // A post whose body container cannot be found is stored with an empty body.
            e.printStackTrace();
            body = "";
        }
        System.out.println("내용 : " + body);

        WebElement webElementCreateDate =
                driver.findElement(By.xpath("//*[@id=\"part1\"]/div[3]/div[1]/div/ul/li[2]/span"));
        String create_date = webElementCreateDate.getText();
        System.out.println("작성일 : " + create_date);

        WebElement webElementUser = driver.findElement(
                By.xpath("//*[@id=\"ctl00_PlaceHolderMain_UFAuthor_Iink_Userlink\"]"));
        String user_nm = webElementUser.getText();
        // The author link is a javascript: call; stripping the wrapper leaves its argument.
        String user_id = webElementUser.getAttribute("href")
                .replace("javascript:fnUserInfo('", "")
                .replace("');", "");
        System.out.println("작성자 : " + user_nm);
        System.out.println("user_id : " + user_id);

        // A fresh map per post keeps values of earlier posts out of this post's inserts.
        Map<String, Object> paramMap = new HashMap<>();
        paramMap.put("uuid", uuid);
        paramMap.put("title", title);
        paramMap.put("user_id", user_id);
        paramMap.put("user_nm", user_nm);
        paramMap.put("create_date", create_date);
        paramMap.put("body", body);
        paramMap.put("main_img", main_img);
        paramMap.put("category", category);
        paramMap.put("gubun", gubun);

        // NOTE(review): one service instance per post, as before; reuse it if that is safe.
        MyBatisService service = new MyBatisService();
        System.out.println("paramMap : " + paramMap);
        service.insertTmpBoard(paramMap);

        saveAttachments(service, paramMap);
        saveImages(service, paramMap);
    }

    /**
     * Stores URL and display name of every link in the attachments table of the current page.
     *
     * @param service service used for the inserts
     * @param postParams fields of the owning post, copied into every attachment record
     * @throws Exception if the service call fails
     */
    private void saveAttachments(MyBatisService service, Map<String, Object> postParams)
            throws Exception {
        List<WebElement> files =
                driver.findElements(By.xpath("//*[@id=\"idAttachmentsTable\"]/tbody//a"));
        for (WebElement file : files) {
            String fileUrl = file.getAttribute("href");
            String fileNm = file.getText();
            System.out.println("file_url : " + fileUrl);
            System.out.println("file_name : " + fileNm);

            Map<String, Object> fileParams = new HashMap<>(postParams);
            fileParams.put("file_url", fileUrl);
            fileParams.put("file_nm", fileNm);
            service.insertTmpFile(fileParams);
        }
    }

    /**
     * Stores the {@code src} of every image inside the post body of the current page.
     *
     * <p>Only the URL is recorded; the image itself is not downloaded.
     *
     * @param service service used for the inserts
     * @param postParams fields of the owning post, copied into every image record
     * @throws Exception if the service call fails
     */
    private void saveImages(MyBatisService service, Map<String, Object> postParams)
            throws Exception {
        List<WebElement> imgs = driver.findElements(By.xpath(POST_BODY_XPATH + "//img"));
        for (WebElement img : imgs) {
            String imgUrl = img.getAttribute("src");
            System.out.println("imgUrl : " + imgUrl);

            Map<String, Object> imgParams = new HashMap<>(postParams);
            imgParams.put("img_url", imgUrl);
            service.insertTmpImg(imgParams);
        }
    }

    /**
     * Sleeps for the given time; when interrupted, returns early with the interrupt flag set
     * again so the caller's thread state is not lost.
     *
     * @param millis time to wait in milliseconds
     */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
chromedriver_win32.zip
6.78MB
selenium.z01
17.00MB
selenium.zip
10.09MB
728x90
'Programming > JAVA' 카테고리의 다른 글
Json String to VO (0) | 2023.04.01 |
---|---|
이클립스 화면설계 플러그인 (0) | 2023.04.01 |
WIN10 ipconfig wsl ip tunneling (0) | 2023.03.29 |
Windows에서 JNA를 사용하여 Java에서 메모리를 조작하는 방법 (0) | 2023.03.29 |