Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add logging to crawl.log for metadata records created by ExtractorYoutubeDL #593

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions contrib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
<artifactId>Java-WebSocket</artifactId>
<version>1.5.2</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.16.1</version>
</dependency>
</dependencies>
<repositories>
<repository>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand All @@ -43,7 +44,9 @@
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.frontier.AMQPUrlReceiver;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants.WARCRecordType;
Expand All @@ -52,8 +55,6 @@
import org.archive.modules.CrawlURI;
import org.archive.modules.warc.BaseWARCRecordBuilder;
import org.archive.modules.warc.WARCRecordBuilder;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
import org.springframework.beans.factory.annotation.Autowired;
Expand Down Expand Up @@ -109,6 +110,7 @@ public class ExtractorYoutubeDL extends Extractor
protected static final String YDL_CONTAINING_PAGE_DIGEST = "ydl-containing-page-digest";
protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp";
protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri";
// Key in CrawlURI.getData() under which buildRecord stores the raw SHA-1 digest bytes of the
// youtube-dl JSON temp file; postWrite reads the value back when building the crawl.log entry.
protected static final String YDL_JSON_FILE_DIGEST = "ydl-json-file-digest";

protected static final int MAX_VIDEOS_PER_PAGE = 1000;

Expand Down Expand Up @@ -173,6 +175,25 @@ public void setCrawlerLoggerModule(CrawlerLoggerModule crawlerLoggerModule) {
this.crawlerLoggerModule = crawlerLoggerModule;
}

/**
 * Crawl controller. {@code postWrite} goes through its logger module to obtain the
 * URI-processing logger used for metadata-record log entries.
 */
@Autowired
protected CrawlController controller;
/**
 * Sets the crawl controller explicitly (the field is also annotated for autowiring).
 *
 * @param controller controller whose logger module is used by {@code postWrite}
 */
public void setCrawlController(CrawlController controller) {
this.controller = controller;
}

// Instance initializer: metadata-record logging is switched on by default.
{
setLogMetadataRecord(true);
}
/**
 * @return the current value of the {@code logMetadataRecord} setting stored in {@code kp}
 */
public boolean getLogMetadataRecord() {
return (Boolean) kp.get("logMetadataRecord");
}
/**
 * Whether or not to create a crawl.log entry for any WARC Metadata Records written.
 * The instance initializer sets this to {@code true}.
 */
public void setLogMetadataRecord(boolean logMetadataRecord) {
kp.put("logMetadataRecord",logMetadataRecord);
}

@Override
public void start() {
if (!isRunning) {
Expand Down Expand Up @@ -372,6 +393,12 @@ public int read(byte b[], int off, int len) throws IOException {
}
}

/**
 * Dummy output stream to swallow bytes without storing anything.
 *
 * <p>Both the single-byte and the bulk write are overridden. Without the bulk override the
 * inherited {@link OutputStream#write(byte[], int, int)} would call {@link #write(int)} once
 * per byte, which is wasted work for a sink that keeps nothing.
 */
public class NullOutputStream extends OutputStream {
    /** Discards the given byte. */
    @Override
    public void write(int b) throws IOException {}

    /**
     * Discards {@code len} bytes in a single step.
     *
     * <p>Arguments are still validated the same way the inherited implementation validates
     * them, so misuse by a caller is reported rather than hidden.
     *
     * @param b   source buffer; must not be {@code null}
     * @param off start offset in {@code b}
     * @param len number of bytes to discard
     * @throws NullPointerException      if {@code b} is {@code null}
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not describe a range inside {@code b}
     */
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        }
        // Written as a subtraction so that off + len cannot overflow.
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
    }
}

/**
* Streams through youtube-dl json output. Sticks video urls in
* <code>results.videoUrls</code>, web page urls in
Expand Down Expand Up @@ -583,6 +610,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)

getLocalTempFile().seek(0);
InputStream inputStream = Channels.newInputStream(getLocalTempFile().getChannel());
curi.getData().put(YDL_JSON_FILE_DIGEST, DigestUtils.sha1(inputStream));
//Leave InputStream open for warc writer to handle

getLocalTempFile().seek(0);
recordInfo.setContentStream(inputStream);
recordInfo.setContentLength(getLocalTempFile().length());

Expand All @@ -591,6 +622,43 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
return recordInfo;
}

/**
 * Writes a crawl.log entry for the youtube-dl WARC metadata record that was just written.
 *
 * <p>The metadata record has no CrawlURI of its own, so nothing would otherwise reach the
 * crawl.log for it. When {@link #getLogMetadataRecord()} is enabled, this method derives a
 * stand-in CrawlURI from <code>curi</code> for the record's URL, copies the record's size,
 * mimetype, WARC filename/offset and the previously stored JSON digest onto it, and hands it
 * to the URI-processing logger. When the setting is disabled the method does nothing.
 *
 * @param recordInfo WARCRecordInfo object that was just written
 * @param curi CrawlURI that generated the WARCRecordInfo Object
 */
@Override
public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) {
    if (this.getLogMetadataRecord()) {
        try {
            final CrawlURI logEntry =
                    curi.createCrawlURI(recordInfo.getUrl(), LinkContext.EMBED_MISC, Hop.INFERRED);

            logEntry.getAnnotations().add("youtube-dl:");
            logEntry.setThreadNumber(curi.getThreadNumber());
            logEntry.setContentSize(recordInfo.getContentLength());
            logEntry.setContentType(recordInfo.getMimetype());
            logEntry.addExtraInfo("warcFilename", recordInfo.getWARCFilename());
            logEntry.addExtraInfo("warcFileOffset", recordInfo.getWARCFileOffset());
            logEntry.setFetchStatus(204);

            // Digest bytes were stored on the originating CrawlURI under YDL_JSON_FILE_DIGEST.
            final byte[] jsonDigest = (byte[]) curi.getData().get(YDL_JSON_FILE_DIGEST);
            logEntry.setContentDigest("sha1", jsonDigest);
            logEntry.addExtraInfo("contentSize", recordInfo.getContentLength());

            final Logger crawlLog = this.controller.getLoggerModule().getUriProcessing();
            final Object[] logParams = new Object[] { logEntry };
            crawlLog.log(Level.INFO, curi.getUURI().toString(), logParams);
        } catch (URIException e) {
            logger.log(Level.WARNING, "Exception while parsing UURI for youtube-dl metadata record " + recordInfo.getUrl(), e);
        } catch (IOException e) {
            logger.log(Level.WARNING, "Exception while generating digest for youtube-dl metadata record " + recordInfo.getUrl(), e);
        }
    }
}

public static void main(String[] args) throws IOException {
/*
File t = File.createTempFile("ydl", ".json");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package org.archive.modules.extractor;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;

/**
 * Tests for the WARC metadata record built by {@link ExtractorYoutubeDL} and for the
 * log entry emitted by its {@code postWrite} hook.
 *
 * <p>All checks go through JUnit assertions rather than the Java {@code assert} keyword, so
 * they are enforced regardless of whether the JVM runs with assertions enabled.
 */
public class ExtractorYoutubeDLTest extends ContentExtractorTestBase {

    protected String getTestUri() {
        return "https://www.youtube.com/watch?v=i08NNO-DPgg";
    }

    protected String getTestResourceFileName() {
        return "ExtractorYoutubeDL.json";
    }

    protected String getTestResourceSha1() {
        return "WFD7RIFCGNFVAWBEWLF6T2HXPXDEZY45";
    }

    /**
     * Reads the JSON fixture from the classpath, closing the stream afterwards.
     *
     * @return the raw bytes of the fixture
     * @throws Exception if the resource cannot be read
     */
    private byte[] readTestResource() throws Exception {
        String resourceName = getTestResourceFileName();
        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(resourceName)) {
            org.junit.Assert.assertNotNull("test resource not found on classpath: " + resourceName, is);
            return IOUtils.toByteArray(is);
        }
    }

    /**
     * Copies <code>json</code> into the extractor's local temp file and builds the metadata record from it.
     *
     * @param ex extractor under test
     * @param testUri CrawlURI the record is built for
     * @param json bytes standing in for the youtube-dl output
     * @return the record returned by {@code buildRecord}
     * @throws Exception on any I/O failure
     */
    private WARCRecordInfo buildRecordFrom(ExtractorYoutubeDL ex, CrawlURI testUri, byte[] json) throws Exception {
        // Deliberately not closed: closing a Channels-created stream closes the underlying channel,
        // and buildRecord still needs the temp file afterwards.
        OutputStream os = Channels.newOutputStream(ex.getLocalTempFile().getChannel());
        IOUtils.write(json, os);
        return ex.buildRecord(testUri, null);
    }

    /**
     * Test that we have the expected WARC Metadata Record given a json output from yt-dlp
     * @throws Exception
     */
    public void testBuildRecord() throws Exception {
        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
        byte[] jsonResults = readTestResource();
        ExtractorYoutubeDL ex = (ExtractorYoutubeDL) extractor;

        WARCRecordInfo record = buildRecordFrom(ex, testUri, jsonResults);

        // JUnit convention: expected value first, actual value second.
        assertEquals("youtube-dl:" + getTestUri(), record.getUrl());
        assertEquals(WARCConstants.WARCRecordType.metadata, record.getType());
        assertEquals("application/vnd.youtube-dl_formats+json;charset=utf-8", record.getMimetype());

        //Test input file is the same content as the content to be written to warc
        byte[] outputArray = IOUtils.toByteArray(record.getContentStream());
        org.junit.Assert.assertArrayEquals(jsonResults, outputArray);
    }

    /**
     * Test that the resulting log line is as expected, and the resulting hash string matches
     * @throws Exception
     */
    public void testPostWrite() throws Exception {
        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
        byte[] jsonResults = readTestResource();
        ExtractorYoutubeDL ex = (ExtractorYoutubeDL) extractor;

        WARCRecordInfo record = buildRecordFrom(ex, testUri, jsonResults);

        ex.controller.setLoggerModule(new CrawlerLoggerModule() {
            @Override
            public Logger getUriProcessing() {
                Logger logger = Logger.getLogger(ExtractorYoutubeDL.class.getName());

                logger.setLevel(Level.ALL);
                return logger;
            }
        });
        Logger logger = ex.controller.getLoggerModule().getUriProcessing();
        TestLogHandler logHandler = new TestLogHandler();
        logger.addHandler(logHandler);
        try {
            UriProcessingFormatter formatter = new UriProcessingFormatter(true);

            ex.setLogMetadataRecord(false);
            ex.postWrite(record, testUri);
            org.junit.Assert.assertEquals("nothing should be logged when logMetadataRecord is off",
                    0, logHandler.getLines().length);

            ex.setLogMetadataRecord(true);
            ex.postWrite(record, testUri);
            LogRecord[] logLines = logHandler.getLines();
            org.junit.Assert.assertTrue("expected a log record when logMetadataRecord is on",
                    logLines.length > 0);

            String message = formatter.format(logLines[0]);
            String expectedCrawlLogLine = " 204 434699 youtube-dl:https://www.youtube.com/watch?v=i08NNO-DPgg I https://www.youtube.com/watch?v=i08NNO-DPgg application/vnd.youtube-dl_formats+json #000 - sha1:WFD7RIFCGNFVAWBEWLF6T2HXPXDEZY45 - youtube-dl: {\"contentSize\":434699}";
            org.junit.Assert.assertTrue("unexpected crawl.log line: " + message,
                    message.contains(expectedCrawlLogLine));
        } finally {
            // The logger is looked up by name and shared JVM-wide; do not leave the test handler attached.
            logger.removeHandler(logHandler);
        }
    }

    @Override
    protected Extractor makeExtractor() {
        CrawlController controller = new CrawlController();
        ExtractorYoutubeDL ex = new ExtractorYoutubeDL();
        ex.setCrawlController(controller);

        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        ex.setLoggerModule(ulm);

        return ex;
    }
}

/**
 * Logging handler that simply remembers every {@link LogRecord} published to it,
 * so a test can inspect the records afterwards.
 */
class TestLogHandler extends Handler
{
    /** Records received so far, in publication order. */
    ArrayList<LogRecord> logLines;

    public TestLogHandler() {
        super();
        logLines = new ArrayList<>();
    }

    /**
     * @return a fresh array holding the records captured so far
     */
    public LogRecord[] getLines() {
        LogRecord[] snapshot = new LogRecord[logLines.size()];
        return logLines.toArray(snapshot);
    }

    /** Stores the record; no filtering or formatting is applied. */
    @Override
    public void publish(LogRecord record) {
        logLines.add(record);
    }

    /** Nothing to release. */
    @Override
    public void close() {
    }

    /** Nothing is buffered, so there is nothing to flush. */
    @Override
    public void flush() {
    }
}
1 change: 1 addition & 0 deletions contrib/src/test/resources/ExtractorYoutubeDL.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.archive.modules.warc;

import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.UUID;
Expand All @@ -12,4 +15,7 @@ public static URI generateRecordID() {
throw new RuntimeException(e); // impossible
}
}
/**
 * Default post-write hook: does nothing. Subclasses can override it to react to a record
 * after it has been written.
 *
 * @param recordInfo the record passed to the hook
 * @param curi the CrawlURI passed to the hook
 */
public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) {
    // intentionally empty
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@ public interface WARCRecordBuilder {
WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
throws IOException;

}
/**
 * Hook that receives a record together with the CrawlURI it was built for.
 * NOTE(review): in the writeRecords code visible alongside this change it is invoked after
 * the record has been passed to the writer — confirm this holds for every caller.
 *
 * @param warcRecordInfo the record handed to the hook
 * @param curi the CrawlURI the record was built for
 */
void postWrite(WARCRecordInfo warcRecordInfo, CrawlURI curi);

}
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
if (recordBuilder.shouldBuildRecord(curi)) {
WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo);
if (record != null) {
record.setWARCFileOffset(writer.getPosition());
writer.writeRecord(record);
record.setWARCFilename(writer.getFilenameWithoutOccupiedSuffix());
InputStream is = null;
try {
is = record.getContentStream();
Expand All @@ -178,6 +180,7 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
if (concurrentTo == null) {
concurrentTo = record.getRecordId();
}
recordBuilder.postWrite(record, curi);
}
}
}
Expand Down
Loading