Adaptions for Demo #121

Merged 15 commits on Jan 28, 2024.
17 changes: 17 additions & 0 deletions src/main/java/org/variantsync/diffdetective/AnalysisRunner.java
@@ -57,6 +57,23 @@ public record Options(
*/
boolean pullRepositoriesBeforeAnalysis
) {
/**
* Creates options with the given parameters and uses default
* values for all other parameters.
* @see Options#Options(Path, Path, Path, Function, Function, boolean, boolean)
* @see Options#DEFAULT(String[])
*/
public Options(Path repositoriesDirectory,
Path outputDirectory,
Path datasetsFile) {
this(
repositoriesDirectory, outputDirectory, datasetsFile,
Repository::getParseOptions,
Repository::getDiffFilter,
true,
false);
}

public static Options DEFAULT(final String[] args) {
final Path datasetsFile;
if (args.length < 1) {
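
The new three-argument constructor can be exercised from a small driver. A minimal sketch, assuming Options is the nested AnalysisRunner.Options record and using made-up demo paths (neither the class name qualification nor the paths are taken from this PR):

import java.nio.file.Path;
import org.variantsync.diffdetective.AnalysisRunner;

class DemoOptions {
    public static void main(String[] args) {
        // Only the three paths are given; parse options, diff filter and both boolean
        // flags fall back to the defaults chosen by the new convenience constructor.
        var options = new AnalysisRunner.Options(
                Path.of("repositories"),        // repositoriesDirectory (hypothetical)
                Path.of("results/demo"),        // outputDirectory (hypothetical)
                Path.of("datasets/demo.md"));   // datasetsFile (hypothetical)
        System.out.println(options);
    }
}
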
128 changes: 75 additions & 53 deletions src/main/java/org/variantsync/diffdetective/analysis/Analysis.java
@@ -20,13 +20,9 @@
import org.variantsync.diffdetective.diff.git.GitDiffer;
import org.variantsync.diffdetective.diff.git.PatchDiff;
import org.variantsync.diffdetective.diff.result.CommitDiffResult;
import org.variantsync.diffdetective.diff.result.DiffError;
import org.variantsync.diffdetective.metadata.Metadata;
import org.variantsync.diffdetective.parallel.ScheduledTasksIterator;
import org.variantsync.diffdetective.util.Assert;
import org.variantsync.diffdetective.util.Clock;
import org.variantsync.diffdetective.util.Diagnostics;
import org.variantsync.diffdetective.util.InvocationCounter;
import org.variantsync.diffdetective.util.*;
import org.variantsync.diffdetective.variation.DiffLinesLabel;
import org.variantsync.diffdetective.variation.diff.Time;
import org.variantsync.diffdetective.variation.diff.VariationDiff;
@@ -42,8 +38,8 @@
* provides access to the current state of the analysis in one thread. Depending on the current
* {@link Hooks phase} only a subset of the state accessible via getters may be valid.
*
* @see forEachRepository
* @see forEachCommit
* @see #forEachRepository
* @see #forEachCommit
* @author Paul Bittner, Benjamin Moosherr
*/
public class Analysis {
Expand All @@ -57,7 +53,7 @@ public class Analysis {
public static final String TOTAL_RESULTS_FILE_NAME = "totalresult" + EXTENSION;
/**
* Default value for <code>commitsToProcessPerThread</code>
* @see forEachCommit(Supplier, int, int)
* @see #forEachCommit(Supplier, int, int)
*/
public static final int COMMITS_TO_PROCESS_PER_THREAD_DEFAULT = 1000;

Expand All @@ -73,6 +69,38 @@ public class Analysis {
protected final Path outputDir;
protected Path outputFile;
protected final AnalysisResult result;

/**
* The total number of commits in the observed history of the given repository.
*/
public final static class TotalNumberOfCommitsResult extends SimpleMetadata<Integer, TotalNumberOfCommitsResult> {
public final static ResultKey<TotalNumberOfCommitsResult> KEY = new ResultKey<>(TotalNumberOfCommitsResult.class.getName());

public TotalNumberOfCommitsResult() {
super(
0,
MetadataKeys.TOTAL_COMMITS,
Integer::sum,
Integer::parseInt
);
}
}

/**
* The effective runtime in seconds that we have when using multithreading.
*/
public final static class RuntimeWithMultithreadingResult extends SimpleMetadata<Double, RuntimeWithMultithreadingResult> {
public final static ResultKey<RuntimeWithMultithreadingResult> KEY = new ResultKey<>(RuntimeWithMultithreadingResult.class.getName());

public RuntimeWithMultithreadingResult() {
super(
0.0,
MetadataKeys.RUNTIME_WITH_MULTITHREADING,
Double::sum,
Double::parseDouble
);
}
}

/**
* The repository this analysis is run on.
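
The two new result classes above follow one pattern: a SimpleMetadata subclass fixes a neutral initial value, a metadata key, a merge function, and a parser, and publishes a ResultKey for lookup. A sketch of a further counter defined the same way; the class name and key string are invented for illustration and are not part of this PR, while the constructor shape mirrors the two classes above:

// Hypothetical example following the pattern above, declared next to the classes it mimics.
public final static class ProcessedPatchesResult
        extends SimpleMetadata<Integer, ProcessedPatchesResult> {
    public final static ResultKey<ProcessedPatchesResult> KEY =
            new ResultKey<>(ProcessedPatchesResult.class.getName());

    public ProcessedPatchesResult() {
        super(
            0,                     // neutral initial value
            "processed patches",   // key under which the value is exported (made-up string)
            Integer::sum,          // how two batch results are merged
            Integer::parseInt      // how an exported value is parsed back in
        );
    }
}

// Typical wiring, analogous to TotalNumberOfCommitsResult:
//   analysis.append(ProcessedPatchesResult.KEY, new ProcessedPatchesResult());  // in initializeResults
//   analysis.get(ProcessedPatchesResult.KEY).value++;                           // in a hook
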
@@ -133,23 +161,23 @@ public Path getOutputFile() {

/**
* The results of the analysis. This may be modified by any hook and should be initialized in
* {@link Hooks#initializeResults} (e.g. by using {@link append}).
* {@link Hooks#initializeResults} (e.g. by using {@link #append}).
* Always valid.
*/
public AnalysisResult getResult() {
return result;
}

/**
* Convenience getter for {@link AnalysisResult#get} on {@link getResult}.
* Convenience getter for {@link AnalysisResult#get} on {@link #getResult}.
* Always valid.
*/
public <T extends Metadata<T>> T get(ResultKey<T> resultKey) {
return result.get(resultKey);
}

/**
* Convenience function for {@link AnalysisResult#append} on {@link getResult}.
* Convenience function for {@link AnalysisResult#append} on {@link #getResult}.
* Always valid.
*/
public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
@@ -179,13 +207,13 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
* end hooks).
*
* <p>An analysis implementing {@code Hooks} can perform various actions during each hook. This
* includes the {@link append creation} and {@link get modification} of {@link getResult
* includes the {@link #append creation} and {@link #get modification} of {@link #getResult
* analysis results}, modifying their internal state, performing IO operations and throwing
* exceptions. In contrast, the only analysis state hooks are allowed to modify is the {@link
* getResult result} of an {@link Analysis}. All other state (e.g. {@link getCurrentCommit})
* #getResult result} of an {@link Analysis}. All other state (e.g. {@link #getCurrentCommit})
* must not be modified. Care must be taken to avoid the reliance of the internal state on a
* specific commit batch being processed as only the {@link getResult results} of each commit
* batch are merged and returned by {@link forEachCommit}.
* specific commit batch being processed as only the {@link #getResult results} of each commit
* batch are merged and returned by {@link #forEachCommit}.
*
* <p>Hooks that return a {@code boolean} are called filter hooks and can, in addition to the
* above, skip any further processing in the current phase (including following inner phases) by
@@ -198,8 +226,8 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
*/
public interface Hooks {
/**
* Initialization hook for {@link getResult}. All result types should be appended with a
* neutral value using {@link append}. No other side effects should be performed during this
* Initialization hook for {@link #getResult}. All result types should be appended with a
* neutral value using {@link #append}. No other side effects should be performed during this
* method as it might be called an arbitrary number of times.
*/
default void initializeResults(Analysis analysis) {}
Expand All @@ -208,7 +236,7 @@ default void beginBatch(Analysis analysis) throws Exception {}
/**
* Signals a parsing failure of all patches in the current commit.
* Called at most once during the commit phase. If this hook is called {@link
* onParsedCommit} and the following patch phase invocations are skipped.
* #onParsedCommit} and the following patch phase invocations are skipped.
*/
default void onFailedCommit(Analysis analysis) throws Exception {}
/**
Expand All @@ -235,9 +263,9 @@ default void endBatch(Analysis analysis) throws Exception {}
/**
* Runs {@code analyzeRepository} on each repository, skipping repositories where an analysis
* was already run. This skipping mechanism doesn't distinguish between different analyses as it
* only checks for the existence of {@link TOTAL_RESULTS_FILE_NAME}. Delete this file to rerun
* only checks for the existence of {@link #TOTAL_RESULTS_FILE_NAME}. Delete this file to rerun
* the analysis.
*
* <p>
* For each repository a directory in {@code outputDir} is passed to {@code analyzeRepository}
* where the results of the given repository should be written.
*
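
The skipping described here boils down to an existence check on the total-results file. A rough sketch of that idea, assuming a per-repository output directory; the actual code inside forEachRepository may differ:

// Sketch only (not the code in forEachRepository): a repository counts as already
// analyzed if its total-results file exists; delete that file to rerun the analysis.
static boolean alreadyAnalyzed(java.nio.file.Path repositoryOutputDir) {
    return java.nio.file.Files.exists(
            repositoryOutputDir.resolve(Analysis.TOTAL_RESULTS_FILE_NAME));
}
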
@@ -289,17 +317,17 @@ public static AnalysisResult forSingleCommit(final String commitHash, final Anal
AnalysisResult result = null;
try {
final RevCommit commit = analysis.differ.getCommit(commitHash);
result = analysis.processCommits(List.of(commit), analysis.differ);
analysis.processCommitBatch(List.of(commit));
result = analysis.getResult();
} catch (Exception e) {
Logger.error("Failed to analyze {}. Exiting.", commitHash);
System.exit(1);
}

final double runtime = clock.getPassedSeconds();
Logger.info("<<< done in {}", Clock.printPassedSeconds(runtime));

result.runtimeWithMultithreadingInSeconds = -1;
result.totalCommits = 1;

result.get(TotalNumberOfCommitsResult.KEY).value++;

exportMetadata(analysis.getOutputDir(), result);
return result;
@@ -339,8 +367,8 @@ public boolean beginPatch(Analysis analysis) {
}

/**
* Same as {@link forEachCommit(Supplier<Analysis>, int, int)}.
* Defaults to {@link COMMITS_TO_PROCESS_PER_THREAD_DEFAULT} and a machine dependent number of
* Same as {@link #forEachCommit(Supplier, int, int)}.
* Defaults to {@link #COMMITS_TO_PROCESS_PER_THREAD_DEFAULT} and a machine dependent number of
* {@link Diagnostics#getNumberOfAvailableProcessors}.
*/
public static AnalysisResult forEachCommit(Supplier<Analysis> analysis) {
@@ -370,6 +398,7 @@ public static AnalysisResult forEachCommit(
) {
var analysis = analysisFactory.get();
analysis.differ = new GitDiffer(analysis.getRepository());
analysis.result.append(RuntimeWithMultithreadingResult.KEY, new RuntimeWithMultithreadingResult());

final Clock clock = new Clock();

Expand All @@ -385,7 +414,12 @@ public static AnalysisResult forEachCommit(
),
/// 2.) Create a MiningTask for the list of commits. This task will then be processed by one
/// particular thread.
commitList -> () -> analysisFactory.get().processCommits(commitList, analysis.differ)
commitList -> () -> {
Analysis thisThreadsAnalysis = analysisFactory.get();
thisThreadsAnalysis.differ = analysis.differ;
thisThreadsAnalysis.processCommitBatch(commitList);
return thisThreadsAnalysis.getResult();
}
);
Logger.info("<<< done in {}", clock.printPassedSeconds());

Expand All @@ -411,8 +445,8 @@ public static AnalysisResult forEachCommit(
final double runtime = clock.getPassedSeconds();
Logger.info("<<< done in {}", Clock.printPassedSeconds(runtime));

analysis.getResult().runtimeWithMultithreadingInSeconds = runtime;
analysis.getResult().totalCommits = numberOfTotalCommits.invocationCount().get();
analysis.getResult().get(RuntimeWithMultithreadingResult.KEY).value = runtime;
// analysis.getResult().get(TotalNumberOfCommitsResult.KEY).value = numberOfTotalCommits.invocationCount().get();

exportMetadata(analysis.getOutputDir(), analysis.getResult());
return analysis.getResult();
Expand All @@ -435,39 +469,22 @@ public Analysis(
this.hooks = hooks;
this.repository = repository;
this.outputDir = outputDir;
this.result = new AnalysisResult();

this.result.repoName = repository.getRepositoryName();

this.result = new AnalysisResult(repository.getRepositoryName());
this.result.taskName = taskName;
this.result.append(TotalNumberOfCommitsResult.KEY, new TotalNumberOfCommitsResult());

for (var hook : hooks) {
hook.initializeResults(this);
}
}

/**
* Entry point into a sequential analysis of {@code commits} as one batch.
* Same as {@link processCommits(List<RevCommit>, GitDiffer)} with a default {@link GitDiffer}.
* Sequential analysis of all {@code commits} as one batch.
*
* @param commits the commit batch to be processed
* @see forEachCommit
* @see #forEachCommit
*/
public AnalysisResult processCommits(List<RevCommit> commits) throws Exception {
return processCommits(commits, new GitDiffer(getRepository()));
}

/**
* Entry point into a sequential analysis of {@code commits} as one batch.
*
* @param commits the commit batch to be processed
* @param differ the differ to use
* @see forEachCommit
*/
public AnalysisResult processCommits(List<RevCommit> commits, GitDiffer differ) throws Exception {
this.differ = differ;
processCommitBatch(commits);
return getResult();
}

protected void processCommitBatch(List<RevCommit> commits) throws Exception {
outputFile = outputDir.resolve(commits.get(0).getId().getName());

@@ -495,6 +512,9 @@ protected void processCommitBatch(List<RevCommit> commits) throws Exception {
}
} finally {
runReverseHook(batchHook, Hooks::endBatch);

// export the thread's result
getResult().exportTo(FileUtils.addExtension(outputFile, Analysis.EXTENSION));
}
}

@@ -540,6 +560,8 @@ protected void processCommit() throws Exception {
runReverseHook(patchHook, Hooks::endPatch);
}
}

getResult().get(TotalNumberOfCommitsResult.KEY).value++;
}

protected void processPatch() throws Exception {
@@ -590,7 +612,7 @@ protected <Hook> void runReverseHook(ListIterator<Hook> hook, FailableBiConsumer

/**
* Exports the given metadata object to a file named according to
* {@link TOTAL_RESULTS_FILE_NAME} in the given directory.
* {@link #TOTAL_RESULTS_FILE_NAME} in the given directory.
* @param outputDir The directory into which the metadata object file should be written.
* @param metadata The metadata to serialize
* @param <T> Type of the metadata.
src/main/java/org/variantsync/diffdetective/analysis/AnalysisResult.java
@@ -38,14 +38,6 @@ public final class AnalysisResult implements Metadata<AnalysisResult> {
*/
public String repoName = NO_REPO;
public String taskName;
/**
* The effective runtime in seconds that we have when using multithreading.
*/
public double runtimeWithMultithreadingInSeconds = 0;
/**
* The total number of commits in the observed history of the given repository.
*/
public int totalCommits = 0;
public final MergeMap<DiffError, Integer> diffErrors = new MergeMap<>(new HashMap<>(), Integer::sum);

private final Map<String, Metadata<?>> results = new HashMap<>();
@@ -106,8 +98,6 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
return ar + "; " + br;
});
a.taskName = Metadata.mergeEqual(a.taskName, b.taskName);
a.runtimeWithMultithreadingInSeconds += b.runtimeWithMultithreadingInSeconds;
a.totalCommits += b.totalCommits;
a.diffErrors.append(b.diffErrors);
b.results.forEach((key, value) -> a.unsafeAppend(key, value));
};
@@ -147,12 +137,11 @@ public LinkedHashMap<String, Object> snapshot() {
LinkedHashMap<String, Object> snap = new LinkedHashMap<>();
snap.put(MetadataKeys.TASKNAME, taskName);
snap.put(MetadataKeys.REPONAME, repoName);
snap.put(MetadataKeys.RUNTIME_WITH_MULTITHREADING, runtimeWithMultithreadingInSeconds);
snap.put(MetadataKeys.TOTAL_COMMITS, totalCommits);

var statistics = get(StatisticsAnalysis.RESULT);
var globals = get(Analysis.TotalNumberOfCommitsResult.KEY);
if (statistics != null) {
snap.put(MetadataKeys.FILTERED_COMMITS, totalCommits - statistics.processedCommits - statistics.emptyCommits - statistics.failedCommits);
snap.put(MetadataKeys.FILTERED_COMMITS, globals.value - statistics.processedCommits - statistics.emptyCommits - statistics.failedCommits);
}

for (var result : results.values()) {
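
As a quick sanity check of the formula above, with the commit total now read from TotalNumberOfCommitsResult, filtered commits are whatever remains after subtracting processed, empty, and failed commits. The numbers below are made up for illustration:

// Illustrative values only: 1000 total commits, of which 800 were processed,
// 150 were empty and 20 failed, leaves 30 commits that were filtered out.
int filteredCommits = 1000 - 800 - 150 - 20; // == 30
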
Expand All @@ -168,14 +157,6 @@ public void setFromSnapshot(LinkedHashMap<String, String> snap) {
repoName = snap.get(MetadataKeys.REPONAME);
taskName = snap.get(MetadataKeys.TASKNAME);

String runtime = snap.get(MetadataKeys.RUNTIME_WITH_MULTITHREADING);
if (runtime.endsWith("s")) {
runtime = runtime.substring(0, runtime.length() - 1);
}
runtimeWithMultithreadingInSeconds = Double.parseDouble(runtime);

totalCommits = Integer.parseInt(snap.get(MetadataKeys.TOTAL_COMMITS));

for (var entry : snap.entrySet()) {
String key = entry.getKey();
if (entry.getKey().startsWith(ERROR_BEGIN)) {