/*
 * Decompiled with CFR 0.152.
 */
package picard.sam.markduplicates;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.IterableAdapter;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import java.io.File;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.programgroups.SamOrBam;
import picard.sam.markduplicates.MarkDuplicatesWithMateCigarIterator;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;

@CommandLineProgramProperties(summary="Identifies duplicate reads, accounting for mate CIGAR.  This tool locates and tags duplicate reads (both PCR and optical) in a BAM or SAM file, where duplicate reads are defined as originating from the same original fragment of DNA, taking into account the CIGAR string of read mates. <br /><br />It is intended as an improvement upon the original MarkDuplicates algorithm, from which it differs in several ways, includingdifferences in how it breaks ties. It may be the most effective duplicate marking program available, as it handles all cases including clipped and gapped alignments and locates duplicate molecules using mate cigar information. However, please note that it is not yet used in the Broad's production pipeline, so use it at your own risk. <br /><br />Note also that this tool will not work with alignments that have large gaps or deletions, such as those from RNA-seq data.  This is due to the need to buffer small genomic windows to ensure integrity of the duplicate marking, while large skips (ex. skipping introns) in the alignment records would force making that window very large, thus exhausting memory. <br /><p>Note: Metrics labeled as percentages are actually expressed as fractions!</p><h4>Usage example:</h4><pre>java -jar picard.jar MarkDuplicatesWithMateCigar \\<br />      I=input.bam \\<br />      O=mark_dups_w_mate_cig.bam \\<br />      M=mark_dups_w_mate_cig_metrics.txt</pre><hr />", oneLineSummary="Identifies duplicate reads, accounting for mate CIGAR.  ", programGroup=SamOrBam.class)
@DocumentedFeature
public class MarkDuplicatesWithMateCigar
extends AbstractMarkDuplicatesCommandLineProgram {
    static final String USAGE_SUMMARY = "Identifies duplicate reads, accounting for mate CIGAR.  ";
    static final String USAGE_DETAILS = "This tool locates and tags duplicate reads (both PCR and optical) in a BAM or SAM file, where duplicate reads are defined as originating from the same original fragment of DNA, taking into account the CIGAR string of read mates. <br /><br />It is intended as an improvement upon the original MarkDuplicates algorithm, from which it differs in several ways, includingdifferences in how it breaks ties. It may be the most effective duplicate marking program available, as it handles all cases including clipped and gapped alignments and locates duplicate molecules using mate cigar information. However, please note that it is not yet used in the Broad's production pipeline, so use it at your own risk. <br /><br />Note also that this tool will not work with alignments that have large gaps or deletions, such as those from RNA-seq data.  This is due to the need to buffer small genomic windows to ensure integrity of the duplicate marking, while large skips (ex. skipping introns) in the alignment records would force making that window very large, thus exhausting memory. <br /><p>Note: Metrics labeled as percentages are actually expressed as fractions!</p><h4>Usage example:</h4><pre>java -jar picard.jar MarkDuplicatesWithMateCigar \\<br />      I=input.bam \\<br />      O=mark_dups_w_mate_cig.bam \\<br />      M=mark_dups_w_mate_cig_metrics.txt</pre><hr />";
    // Logger for this tool; used in doWork for progress/summary messages and in
    // updateProgramRecord for the warn-once messages.
    private final Log log = Log.getInstance(MarkDuplicatesWithMateCigar.class);
    // Command-line argument, default -1. Handed unchanged to the
    // MarkDuplicatesWithMateCigarIterator constructor in doWork.
    @Argument(doc="The minimum distance to buffer records to account for clipping on the 5' end of the records. For a given alignment, this parameter controls the width of the window to search for duplicates of that alignment. Due to 5' read clipping, duplicates do not necessarily have the same 5' alignment coordinates, so the algorithm needs to search around the neighborhood. For single end sequencing data, the neighborhood is only determined by the amount of clipping (assuming no split reads), thus setting MINIMUM_DISTANCE to twice the sequencing read length should be sufficient. For paired end sequencing, the neighborhood is also determined by the fragment insert size, so you may want to set MINIMUM_DISTANCE to something like twice the 99.5% percentile of the fragment insert size distribution (see CollectInsertSizeMetrics). Or you can set this number to -1 to use either a) twice the first read's read length, or b) 100, whichever is smaller. Note that the larger the window, the greater the RAM requirements, so you could run into performance limitations if you use a value that is unnecessarily large.", optional=true)
    public int MINIMUM_DISTANCE = -1;
    // Command-line argument, default true. Note it is package-private, unlike the other
    // arguments declared here. Handed unchanged to the iterator constructor in doWork.
    @Argument(doc="Skip record pairs with no mate cigar and include them in the output.")
    boolean SKIP_PAIRS_WITH_NO_MATE_CIGAR = true;
    // Command-line argument, default 100000. Handed unchanged to the iterator constructor in doWork.
    @Argument(doc="The block size for use in the coordinate-sorted record buffer.", optional=true)
    public int BLOCK_SIZE = 100000;
    // Warn-once latches for updateProgramRecord: each is flipped to true the first time its
    // warning is logged so that the same warning is not repeated for later records.
    private boolean warnedNullProgramRecords = false;
    private boolean warnedMissingProgramRecords = false;

    /**
     * Command-line entry point: builds a fresh tool instance and hands the raw
     * arguments to {@code instanceMainWithExit}.
     *
     * @param args the command-line arguments, forwarded untouched
     */
    public static void main(String[] args) {
        final MarkDuplicatesWithMateCigar tool = new MarkDuplicatesWithMateCigar();
        tool.instanceMainWithExit(args);
    }

    /**
     * Marks duplicates in a single pass over the inputs and writes the resulting records
     * and the duplication metrics.
     *
     * <p>Steps: validate the input/output/metrics paths, open the inputs, clone the input
     * header for the output (adding any {@code COMMENT} lines), compute the program-group
     * chaining map, stream every record from the duplicate-marking iterator to the output
     * writer, then log summary counts and write the metrics file.
     *
     * <p>The output writer and the duplicate-marking iterator are managed with
     * try-with-resources so that both are closed even if iteration or writing fails;
     * previously they were only closed on the success path. The iterator is closed
     * before the writer, matching the original ordering.
     *
     * @return 0 once all records have been written and the metrics finalized
     * @throws PicardException if the input header's sort order is not {@code coordinate}
     */
    @Override
    protected int doWork() {
        IOUtil.assertInputsAreValid(this.INPUT);
        IOUtil.assertFileIsWritable(this.OUTPUT);
        IOUtil.assertFileIsWritable(this.METRICS_FILE);

        final AbstractMarkDuplicatesCommandLineProgram.SamHeaderAndIterator headerAndIterator = this.openInputs(true);

        // The output header starts as a copy of the input header.
        final SAMFileHeader outputHeader = headerAndIterator.header.clone();
        if (outputHeader.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new PicardException("This program requires inputs in coordinate SortOrder");
        }
        this.COMMENT.forEach(outputHeader::addComment);

        // Record which program-group IDs are eligible for chaining, then build the
        // old-PG-ID -> new-PG-ID map that updateProgramRecord consults per record.
        this.setPGIdsSeen(outputHeader);
        final Map<String, String> chainedPgIds = this.getChainedPgIds(outputHeader);

        final ProgressLogger progress = new ProgressLogger(this.log, 1000000, "Read");

        // Kept outside the try block because the summary statistics are read from the
        // iterator after it has been closed (as the original code did).
        final MarkDuplicatesWithMateCigarIterator iterator;

        // Resources are closed in reverse declaration order: iterator first, then writer.
        try (SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader, true, this.OUTPUT);
             MarkDuplicatesWithMateCigarIterator markingIterator = new MarkDuplicatesWithMateCigarIterator(headerAndIterator.header, headerAndIterator.iterator, this.opticalDuplicateFinder, this.DUPLICATE_SCORING_STRATEGY, this.MINIMUM_DISTANCE, this.REMOVE_DUPLICATES, this.SKIP_PAIRS_WITH_NO_MATE_CIGAR, this.MAX_RECORDS_IN_RAM, this.BLOCK_SIZE, this.TMP_DIR)) {
            iterator = markingIterator;
            for (final SAMRecord record : new IterableAdapter<SAMRecord>(markingIterator)) {
                // progress.record(...) returns true when a progress line was emitted;
                // piggy-back the memory statistics on those occasions.
                if (progress.record(record)) {
                    markingIterator.logMemoryStats(this.log);
                }
                this.updateProgramRecord(record, chainedPgIds);
                out.addAlignment(record);
            }
        }

        final Histogram<Short> opticalDupesByLibraryId = iterator.getOpticalDupesByLibraryId();
        this.log.info("Processed " + progress.getCount() + " records");
        this.log.info("Found " + iterator.getNumRecordsWithNoMateCigar() + " records with no mate cigar optional tag.");
        this.log.info("Marking " + iterator.getNumDuplicates() + " records as duplicates.");
        this.log.info("Found " + (long)opticalDupesByLibraryId.getSumOfValues() + " optical duplicate clusters.");

        this.finalizeAndWriteMetrics(iterator.getLibraryIdGenerator());
        return 0;
    }

    /**
     * Rewrites the PG tag of {@code record} to its chained program-group ID, when chaining
     * is enabled and a mapping exists.
     *
     * <p>Nothing happens unless {@code PROGRAM_RECORD_ID} is set and {@code ADD_PG_TAG_TO_READS}
     * is true. A record without a PG tag, or whose PG ID has no entry in {@code chainedPgIds},
     * is left unchanged; each of those two situations is reported with a warning the first
     * time it is encountered and silently ignored afterwards.
     *
     * @param record       the record whose PG tag may be replaced in place
     * @param chainedPgIds map from an existing PG ID to the PG ID that should replace it
     */
    private void updateProgramRecord(SAMRecord record, Map<String, String> chainedPgIds) {
        // Chaining disabled: leave the record alone.
        if (this.PROGRAM_RECORD_ID == null || !this.ADD_PG_TAG_TO_READS) {
            return;
        }

        final String pgTag = SAMTag.PG.name();
        final String currentPgId = record.getStringAttribute(pgTag);

        // No PG tag on the read at all.
        if (currentPgId == null) {
            if (!this.warnedNullProgramRecords) {
                this.warnedNullProgramRecords = true;
                this.log.warn("Encountered a record with no program record, program group chaining will not occur for this read: " + record);
            }
            return;
        }

        // Known PG ID: swap in the chained replacement.
        if (chainedPgIds.containsKey(currentPgId)) {
            record.setAttribute(pgTag, chainedPgIds.get(currentPgId));
            return;
        }

        // PG ID present but not in the chaining map.
        if (!this.warnedMissingProgramRecords) {
            this.warnedMissingProgramRecords = true;
            this.log.warn("Encountered a record with an intermediate program record, program group chaining will not occur for this read: " + record);
        }
    }

    /**
     * Populates {@code pgIdsSeen} with the ID of every program record in {@code header}
     * that no other program record names as its previous program group.
     *
     * @param header the header whose program records are examined
     */
    private void setPGIdsSeen(SAMFileHeader header) {
        // Pass 1: gather every ID that some record points back to as its predecessor.
        final HashSet<String> referencedAsPrevious = new HashSet<String>();
        header.getProgramRecords().stream()
                .map(SAMProgramRecord::getPreviousProgramGroupId)
                .filter(previousId -> previousId != null)
                .forEachOrdered(previousId -> referencedAsPrevious.add(previousId));

        // Pass 2: keep only the IDs that were never referenced as a predecessor.
        header.getProgramRecords().stream()
                .map(SAMProgramRecord::getId)
                .filter(id -> !referencedAsPrevious.contains(id))
                .forEachOrdered(id -> this.pgIdsSeen.add(id));
    }
}

