Adaptions for Demo #121

Merged 15 commits on Jan 28, 2024.
17 changes: 17 additions & 0 deletions src/main/java/org/variantsync/diffdetective/AnalysisRunner.java
@@ -57,6 +57,23 @@ public record Options(
*/
boolean pullRepositoriesBeforeAnalysis
) {
/**
* Creates options with the given parameters and uses default
* values for all other parameters.
* @see Options#Options(Path, Path, Path, Function, Function, boolean, boolean)
* @see Options#DEFAULT(String[])
*/
public Options(Path repositoriesDirectory,
Path outputDirectory,
Path datasetsFile) {
this(
repositoriesDirectory, outputDirectory, datasetsFile,
Repository::getParseOptions,
Repository::getDiffFilter,
true,
false);
}

public static Options DEFAULT(final String[] args) {
final Path datasetsFile;
if (args.length < 1) {
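
The new three-argument constructor can be exercised from a small driver. A minimal sketch, assuming Options is the nested AnalysisRunner.Options record and using made-up demo paths (neither the class name qualification nor the paths are taken from this PR):

import java.nio.file.Path;
import org.variantsync.diffdetective.AnalysisRunner;

class DemoOptions {
    public static void main(String[] args) {
        // Only the three paths are given; parse options, diff filter and both boolean
        // flags fall back to the defaults chosen by the new convenience constructor.
        var options = new AnalysisRunner.Options(
                Path.of("repositories"),        // repositoriesDirectory (hypothetical)
                Path.of("results/demo"),        // outputDirectory (hypothetical)
                Path.of("datasets/demo.md"));   // datasetsFile (hypothetical)
        System.out.println(options);
    }
}
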
128 changes: 75 additions & 53 deletions src/main/java/org/variantsync/diffdetective/analysis/Analysis.java
@@ -20,13 +20,9 @@
import org.variantsync.diffdetective.diff.git.GitDiffer;
import org.variantsync.diffdetective.diff.git.PatchDiff;
import org.variantsync.diffdetective.diff.result.CommitDiffResult;
import org.variantsync.diffdetective.diff.result.DiffError;
import org.variantsync.diffdetective.metadata.Metadata;
import org.variantsync.diffdetective.parallel.ScheduledTasksIterator;
import org.variantsync.diffdetective.util.Assert;
import org.variantsync.diffdetective.util.Clock;
import org.variantsync.diffdetective.util.Diagnostics;
import org.variantsync.diffdetective.util.InvocationCounter;
import org.variantsync.diffdetective.util.*;
import org.variantsync.diffdetective.variation.DiffLinesLabel;
import org.variantsync.diffdetective.variation.diff.Time;
import org.variantsync.diffdetective.variation.diff.VariationDiff;
@@ -42,8 +38,8 @@
* provides access to the current state of the analysis in one thread. Depending on the current
* {@link Hooks phase} only a subset of the state accessible via getters may be valid.
*
* @see forEachRepository
* @see forEachCommit
* @see #forEachRepository
* @see #forEachCommit
* @author Paul Bittner, Benjamin Moosherr
*/
public class Analysis {
Expand All @@ -57,7 +53,7 @@ public class Analysis {
public static final String TOTAL_RESULTS_FILE_NAME = "totalresult" + EXTENSION;
/**
* Default value for <code>commitsToProcessPerThread</code>
* @see forEachCommit(Supplier, int, int)
* @see #forEachCommit(Supplier, int, int)
*/
public static final int COMMITS_TO_PROCESS_PER_THREAD_DEFAULT = 1000;

Expand All @@ -73,6 +69,38 @@ public class Analysis {
protected final Path outputDir;
protected Path outputFile;
protected final AnalysisResult result;

/**
* The total number of commits in the observed history of the given repository.
*/
public final static class TotalNumberOfCommitsResult extends SimpleMetadata<Integer, TotalNumberOfCommitsResult> {
public final static ResultKey<TotalNumberOfCommitsResult> KEY = new ResultKey<>(TotalNumberOfCommitsResult.class.getName());

public TotalNumberOfCommitsResult() {
super(
0,
MetadataKeys.TOTAL_COMMITS,
Integer::sum,
Integer::parseInt
);
}
}

/**
* The effective runtime in seconds that we have when using multithreading.
*/
public final static class RuntimeWithMultithreadingResult extends SimpleMetadata<Double, RuntimeWithMultithreadingResult> {
public final static ResultKey<RuntimeWithMultithreadingResult> KEY = new ResultKey<>(RuntimeWithMultithreadingResult.class.getName());

public RuntimeWithMultithreadingResult() {
super(
0.0,
MetadataKeys.RUNTIME_WITH_MULTITHREADING,
Double::sum,
Double::parseDouble
);
}
}

/**
* The repository this analysis is run on.
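
The two new result classes above follow one pattern: a SimpleMetadata subclass fixes a neutral initial value, a metadata key, a merge function, and a parser, and publishes a ResultKey for lookup. A sketch of a further counter defined the same way; the class name and key string are invented for illustration and are not part of this PR, while the constructor shape mirrors the two classes above:

// Hypothetical example following the pattern above, declared next to the classes it mimics.
public final static class ProcessedPatchesResult
        extends SimpleMetadata<Integer, ProcessedPatchesResult> {
    public final static ResultKey<ProcessedPatchesResult> KEY =
            new ResultKey<>(ProcessedPatchesResult.class.getName());

    public ProcessedPatchesResult() {
        super(
            0,                     // neutral initial value
            "processed patches",   // key under which the value is exported (made-up string)
            Integer::sum,          // how two batch results are merged
            Integer::parseInt      // how an exported value is parsed back in
        );
    }
}

// Typical wiring, analogous to TotalNumberOfCommitsResult:
//   analysis.append(ProcessedPatchesResult.KEY, new ProcessedPatchesResult());  // in initializeResults
//   analysis.get(ProcessedPatchesResult.KEY).value++;                           // in a hook
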
@@ -133,23 +161,23 @@ public Path getOutputFile() {

/**
* The results of the analysis. This may be modified by any hook and should be initialized in
* {@link Hooks#initializeResults} (e.g. by using {@link append}).
* {@link Hooks#initializeResults} (e.g. by using {@link #append}).
* Always valid.
*/
public AnalysisResult getResult() {
return result;
}

/**
* Convenience getter for {@link AnalysisResult#get} on {@link getResult}.
* Convenience getter for {@link AnalysisResult#get} on {@link #getResult}.
* Always valid.
*/
public <T extends Metadata<T>> T get(ResultKey<T> resultKey) {
return result.get(resultKey);
}

/**
* Convenience function for {@link AnalysisResult#append} on {@link getResult}.
* Convenience function for {@link AnalysisResult#append} on {@link #getResult}.
* Always valid.
*/
public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
@@ -179,13 +207,13 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
* end hooks).
*
* <p>An analysis implementing {@code Hooks} can perform various actions during each hook. This
* includes the {@link append creation} and {@link get modification} of {@link getResult
* includes the {@link #append creation} and {@link #get modification} of {@link #getResult
* analysis results}, modifying their internal state, performing IO operations and throwing
* exceptions. In contrast, the only analysis state hooks are allowed to modify is the {@link
* getResult result} of an {@link Analysis}. All other state (e.g. {@link getCurrentCommit})
* #getResult result} of an {@link Analysis}. All other state (e.g. {@link #getCurrentCommit})
* must not be modified. Care must be taken to avoid the reliance of the internal state on a
* specific commit batch being processed as only the {@link getResult results} of each commit
* batch are merged and returned by {@link forEachCommit}.
* specific commit batch being processed as only the {@link #getResult results} of each commit
* batch are merged and returned by {@link #forEachCommit}.
*
* <p>Hooks that return a {@code boolean} are called filter hooks and can, in addition to the
* above, skip any further processing in the current phase (including following inner phases) by
@@ -198,8 +226,8 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
*/
public interface Hooks {
/**
* Initialization hook for {@link getResult}. All result types should be appended with a
* neutral value using {@link append}. No other side effects should be performed during this
* Initialization hook for {@link #getResult}. All result types should be appended with a
* neutral value using {@link #append}. No other side effects should be performed during this
* method as it might be called an arbitrary number of times.
*/
default void initializeResults(Analysis analysis) {}
Expand All @@ -208,7 +236,7 @@ default void beginBatch(Analysis analysis) throws Exception {}
/**
* Signals a parsing failure of all patches in the current commit.
* Called at most once during the commit phase. If this hook is called {@link
* onParsedCommit} and the following patch phase invocations are skipped.
* #onParsedCommit} and the following patch phase invocations are skipped.
*/
default void onFailedCommit(Analysis analysis) throws Exception {}
/**
Expand All @@ -235,9 +263,9 @@ default void endBatch(Analysis analysis) throws Exception {}
/**
* Runs {@code analyzeRepository} on each repository, skipping repositories where an analysis
* was already run. This skipping mechanism doesn't distinguish between different analyses as it
* only checks for the existence of {@link TOTAL_RESULTS_FILE_NAME}. Delete this file to rerun
* only checks for the existence of {@link #TOTAL_RESULTS_FILE_NAME}. Delete this file to rerun
* the analysis.
*
* <p>
* For each repository a directory in {@code outputDir} is passed to {@code analyzeRepository}
* where the results of the given repository should be written.
*
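
The skipping described here boils down to an existence check on the total-results file. A rough sketch of that idea, assuming a per-repository output directory; the actual code inside forEachRepository may differ:

// Sketch only (not the code in forEachRepository): a repository counts as already
// analyzed if its total-results file exists; delete that file to rerun the analysis.
static boolean alreadyAnalyzed(java.nio.file.Path repositoryOutputDir) {
    return java.nio.file.Files.exists(
            repositoryOutputDir.resolve(Analysis.TOTAL_RESULTS_FILE_NAME));
}
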
@@ -289,17 +317,17 @@ public static AnalysisResult forSingleCommit(final String commitHash, final Anal
AnalysisResult result = null;
try {
final RevCommit commit = analysis.differ.getCommit(commitHash);
result = analysis.processCommits(List.of(commit), analysis.differ);
analysis.processCommitBatch(List.of(commit));
result = analysis.getResult();
} catch (Exception e) {
Logger.error("Failed to analyze {}. Exiting.", commitHash);
System.exit(1);
}

final double runtime = clock.getPassedSeconds();
Logger.info("<<< done in {}", Clock.printPassedSeconds(runtime));

result.runtimeWithMultithreadingInSeconds = -1;
result.totalCommits = 1;

result.get(TotalNumberOfCommitsResult.KEY).value++;

exportMetadata(analysis.getOutputDir(), result);
return result;
@@ -339,8 +367,8 @@ public boolean beginPatch(Analysis analysis) {
}

/**
* Same as {@link forEachCommit(Supplier<Analysis>, int, int)}.
* Defaults to {@link COMMITS_TO_PROCESS_PER_THREAD_DEFAULT} and a machine dependent number of
* Same as {@link #forEachCommit(Supplier, int, int)}.
* Defaults to {@link #COMMITS_TO_PROCESS_PER_THREAD_DEFAULT} and a machine dependent number of
* {@link Diagnostics#getNumberOfAvailableProcessors}.
*/
public static AnalysisResult forEachCommit(Supplier<Analysis> analysis) {
@@ -370,6 +398,7 @@ public static AnalysisResult forEachCommit(
) {
var analysis = analysisFactory.get();
analysis.differ = new GitDiffer(analysis.getRepository());
analysis.result.append(RuntimeWithMultithreadingResult.KEY, new RuntimeWithMultithreadingResult());

final Clock clock = new Clock();

Expand All @@ -385,7 +414,12 @@ public static AnalysisResult forEachCommit(
),
/// 2.) Create a MiningTask for the list of commits. This task will then be processed by one
/// particular thread.
commitList -> () -> analysisFactory.get().processCommits(commitList, analysis.differ)
commitList -> () -> {
Analysis thisThreadsAnalysis = analysisFactory.get();
thisThreadsAnalysis.differ = analysis.differ;
thisThreadsAnalysis.processCommitBatch(commitList);
return thisThreadsAnalysis.getResult();
}
);
Logger.info("<<< done in {}", clock.printPassedSeconds());

Expand All @@ -411,8 +445,8 @@ public static AnalysisResult forEachCommit(
final double runtime = clock.getPassedSeconds();
Logger.info("<<< done in {}", Clock.printPassedSeconds(runtime));

analysis.getResult().runtimeWithMultithreadingInSeconds = runtime;
analysis.getResult().totalCommits = numberOfTotalCommits.invocationCount().get();
analysis.getResult().get(RuntimeWithMultithreadingResult.KEY).value = runtime;
// analysis.getResult().get(TotalNumberOfCommitsResult.KEY).value = numberOfTotalCommits.invocationCount().get();

exportMetadata(analysis.getOutputDir(), analysis.getResult());
return analysis.getResult();
Expand All @@ -435,39 +469,22 @@ public Analysis(
this.hooks = hooks;
this.repository = repository;
this.outputDir = outputDir;
this.result = new AnalysisResult();

this.result.repoName = repository.getRepositoryName();

this.result = new AnalysisResult(repository.getRepositoryName());
this.result.taskName = taskName;
this.result.append(TotalNumberOfCommitsResult.KEY, new TotalNumberOfCommitsResult());

for (var hook : hooks) {
hook.initializeResults(this);
}
}

/**
* Entry point into a sequential analysis of {@code commits} as one batch.
* Same as {@link processCommits(List<RevCommit>, GitDiffer)} with a default {@link GitDiffer}.
* Sequential analysis of all {@code commits} as one batch.
*
* @param commits the commit batch to be processed
* @see forEachCommit
* @see #forEachCommit
*/
public AnalysisResult processCommits(List<RevCommit> commits) throws Exception {
return processCommits(commits, new GitDiffer(getRepository()));
}

/**
* Entry point into a sequential analysis of {@code commits} as one batch.
*
* @param commits the commit batch to be processed
* @param differ the differ to use
* @see forEachCommit
*/
public AnalysisResult processCommits(List<RevCommit> commits, GitDiffer differ) throws Exception {
this.differ = differ;
processCommitBatch(commits);
return getResult();
}

protected void processCommitBatch(List<RevCommit> commits) throws Exception {
outputFile = outputDir.resolve(commits.get(0).getId().getName());

@@ -495,6 +512,9 @@ protected void processCommitBatch(List<RevCommit> commits) throws Exception {
}
} finally {
runReverseHook(batchHook, Hooks::endBatch);

// export the thread's result
getResult().exportTo(FileUtils.addExtension(outputFile, Analysis.EXTENSION));
}
}

@@ -540,6 +560,8 @@ protected void processCommit() throws Exception {
runReverseHook(patchHook, Hooks::endPatch);
}
}

getResult().get(TotalNumberOfCommitsResult.KEY).value++;
}

protected void processPatch() throws Exception {
@@ -590,7 +612,7 @@ protected <Hook> void runReverseHook(ListIterator<Hook> hook, FailableBiConsumer

/**
* Exports the given metadata object to a file named according to
* {@link TOTAL_RESULTS_FILE_NAME} in the given directory.
* {@link #TOTAL_RESULTS_FILE_NAME} in the given directory.
* @param outputDir The directory into which the metadata object file should be written.
* @param metadata The metadata to serialize
* @param <T> Type of the metadata.
src/main/java/org/variantsync/diffdetective/analysis/AnalysisResult.java
@@ -38,14 +38,6 @@ public final class AnalysisResult implements Metadata<AnalysisResult> {
*/
public String repoName = NO_REPO;
public String taskName;
/**
* The effective runtime in seconds that we have when using multithreading.
*/
public double runtimeWithMultithreadingInSeconds = 0;
/**
* The total number of commits in the observed history of the given repository.
*/
public int totalCommits = 0;
public final MergeMap<DiffError, Integer> diffErrors = new MergeMap<>(new HashMap<>(), Integer::sum);

private final Map<String, Metadata<?>> results = new HashMap<>();
@@ -106,8 +98,6 @@ public <T extends Metadata<T>> void append(ResultKey<T> resultKey, T value) {
return ar + "; " + br;
});
a.taskName = Metadata.mergeEqual(a.taskName, b.taskName);
a.runtimeWithMultithreadingInSeconds += b.runtimeWithMultithreadingInSeconds;
a.totalCommits += b.totalCommits;
a.diffErrors.append(b.diffErrors);
b.results.forEach((key, value) -> a.unsafeAppend(key, value));
};
@@ -147,12 +137,11 @@ public LinkedHashMap<String, Object> snapshot() {
LinkedHashMap<String, Object> snap = new LinkedHashMap<>();
snap.put(MetadataKeys.TASKNAME, taskName);
snap.put(MetadataKeys.REPONAME, repoName);
snap.put(MetadataKeys.RUNTIME_WITH_MULTITHREADING, runtimeWithMultithreadingInSeconds);
snap.put(MetadataKeys.TOTAL_COMMITS, totalCommits);

var statistics = get(StatisticsAnalysis.RESULT);
var globals = get(Analysis.TotalNumberOfCommitsResult.KEY);
if (statistics != null) {
snap.put(MetadataKeys.FILTERED_COMMITS, totalCommits - statistics.processedCommits - statistics.emptyCommits - statistics.failedCommits);
snap.put(MetadataKeys.FILTERED_COMMITS, globals.value - statistics.processedCommits - statistics.emptyCommits - statistics.failedCommits);
}

for (var result : results.values()) {
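
As a quick sanity check of the formula above, with the commit total now read from TotalNumberOfCommitsResult, filtered commits are whatever remains after subtracting processed, empty, and failed commits. The numbers below are made up for illustration:

// Illustrative values only: 1000 total commits, of which 800 were processed,
// 150 were empty and 20 failed, leaves 30 commits that were filtered out.
int filteredCommits = 1000 - 800 - 150 - 20; // == 30
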
Expand All @@ -168,14 +157,6 @@ public void setFromSnapshot(LinkedHashMap<String, String> snap) {
repoName = snap.get(MetadataKeys.REPONAME);
taskName = snap.get(MetadataKeys.TASKNAME);

String runtime = snap.get(MetadataKeys.RUNTIME_WITH_MULTITHREADING);
if (runtime.endsWith("s")) {
runtime = runtime.substring(0, runtime.length() - 1);
}
runtimeWithMultithreadingInSeconds = Double.parseDouble(runtime);

totalCommits = Integer.parseInt(snap.get(MetadataKeys.TOTAL_COMMITS));

for (var entry : snap.entrySet()) {
String key = entry.getKey();
if (entry.getKey().startsWith(ERROR_BEGIN)) {