Получилось вот так, но все равно не работает
Код | import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.*;
import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; import java.awt.image.BufferedImage; import java.net.URL; import java.net.URLConnection; import javax.imageio.ImageIO; import javax.swing.text.AttributeSet; import javax.swing.text.html.HTMLDocument;
public class ExtractAllImages { static String result_doc = "/home/foo/index.html"; static String home_folder = "/home/foo/"; static String start_webURL = "http://www.oracle.com/";
public static void main(String args[]) throws Exception {
String webUrl = start_webURL; URL url = new URL(webUrl); URLConnection connection = url.openConnection(); InputStream is = connection.getInputStream(); InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr);
HTMLEditorKit htmlKit = new HTMLEditorKit(); HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator(); HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true); FileWriter writer = new FileWriter(result_doc); htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength()); writer.close();
int number_or_images = 0; String[] array = new String[4096];
for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) { AttributeSet attributes = iterator.getAttributes(); String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC); System.out.println("img_src = " + imgSrc);
if (imgSrc != null && (imgSrc.endsWith(".jpg") || (imgSrc.endsWith(".png")) || (imgSrc.endsWith(".jpeg")) || (imgSrc.endsWith(".bmp")) || (imgSrc.endsWith(".ico")))) { try { downloadImage(webUrl, imgSrc); } catch (IOException ex) { System.out.println(ex.getMessage()); } } array[number_or_images] = imgSrc; number_or_images++; ///TODO change
}
for(int i =0; i < number_or_images; i++) { System.out.println("before = "+array[i]); while(true) { int count = array[i].indexOf('/'); if(count == -1) break; array[i] = array[i].substring(count+1); } System.out.println("after = " + array[i]); }
//TODO open file and replace tags int i =0; File input = new File(result_doc); Document doc = Jsoup.parse(input, "UTF-8"); System.out.println( input.canWrite()); for( Element img : doc.select("img[src]") ) { String s = img.attr("src"); System.out.println(s);
img.attr("src", "/home/foo/"+array[i]); // set attribute 'src' to 'your-source-here' s = img.attr("src"); System.out.println(s); ++i; }
} private static void downloadImage(String url, String imgSrc) throws IOException { BufferedImage image = null; try { if (!(imgSrc.startsWith("http"))) { url = url + imgSrc; } else { url = imgSrc; } imgSrc = imgSrc.substring(imgSrc.lastIndexOf("/") + 1); String imageFormat = null; imageFormat = imgSrc.substring(imgSrc.lastIndexOf(".") + 1); String imgPath = null; imgPath = home_folder + imgSrc + ""; URL imageUrl = new URL(url); image = ImageIO.read(imageUrl); if (image != null) { File file = new File(imgPath); ImageIO.write(image, imageFormat, file); } } catch (Exception ex) { ex.printStackTrace(); }
} }
|
|