Try to fix detecting replies to a comment on the previous page

When getting a page which is not the initial page, it is possible that the first comments are replies to a comment from a previous page.
2023-01-02 18:59:03 +01:00 · 2023-01-02 18:59:03 +01:00 · e5be686b06
commit e5be686b06
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java
@ -13,7 +13,7 @@ import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
 * <br>
 * A page has an {@link #id}, an {@link #url}, as well as information on possible {@link #cookies}.
 * In case the data behind the URL has already been retrieved,
- * it can be accessed by using @link #getBody()} and {@link #getContent()}.
+ * it can be accessed by using {@link #getBody()} or {@link #getContent()}.
 */
 public class Page implements Serializable {
    private final String url;
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java
@ -21,13 +21,24 @@ import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
 import org.schabi.newpipe.extractor.services.soundcloud.SoundcloudParsingHelper;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;

 import javax.annotation.Nonnull;
+import javax.annotation.Nullable;

 public class SoundcloudCommentsExtractor extends CommentsExtractor {
    public static final String COLLECTION = "collection";
    public static final String NEXT_HREF = "next_href";

+    /**
+     * The last comment which was a top level comment.
+     * Next pages might start with replies to the last top level comment
+     * and therefore the {@link SoundcloudCommentsInfoItemExtractor#replyCount}
+     * of the last top level comment cannot be determined with certainty.
+     */
+    @Nullable private JsonObject lastTopLevelComment;
+
    public SoundcloudCommentsExtractor(final StreamingService service,
                                       final ListLinkHandler uiHandler) {
        super(service, uiHandler);
@ -50,14 +61,15 @@ public class SoundcloudCommentsExtractor extends CommentsExtractor {
        final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(
                getServiceId());

-        collectCommentsFrom(collector, json);
+        collectCommentsFrom(collector, json, null);

        return new InfoItemsPage<>(collector, new Page(json.getString(NEXT_HREF)));
    }

    @Override
-    public InfoItemsPage<CommentsInfoItem> getPage(final Page page) throws ExtractionException,
-            IOException {
+    public InfoItemsPage<CommentsInfoItem> getPage(final Page page)
+            throws ExtractionException, IOException {
+
        if (page == null || isNullOrEmpty(page.getUrl())) {
            throw new IllegalArgumentException("Page doesn't contain an URL");
        }
@ -88,7 +100,7 @@ public class SoundcloudCommentsExtractor extends CommentsExtractor {
            } catch (final JsonParserException e) {
                throw new ParsingException("Could not parse json", e);
            }
-            collectCommentsFrom(collector, json);
+            collectCommentsFrom(collector, json, lastTopLevelComment);
        }

        if (hasNextPage) {
@ -101,27 +113,86 @@ public class SoundcloudCommentsExtractor extends CommentsExtractor {
    @Override
    public void onFetchPage(@Nonnull final Downloader downloader) { }

-    private void collectCommentsFrom(final CommentsInfoItemsCollector collector,
-                                     final JsonObject json) throws ParsingException {
+    /**
+     * Collect top level comments from a SoundCloud API response.
+     * @param collector the collector which collects the top level comments
+     * @param json the JsonObject of the API response
+     * @param lastTopLevelComment the last top level comment from the previous page or {@code null}
+     *                            if this method is run for the initial page.
+     * @throws ParsingException
+     */
+    private void collectCommentsFrom(@Nonnull final CommentsInfoItemsCollector collector,
+                                     @Nonnull final JsonObject json,
+                                     @Nullable final JsonObject lastTopLevelComment)
+            throws ParsingException {
+        final List<SoundcloudCommentsInfoItemExtractor> extractors = new ArrayList<>();
        final String url = getUrl();
        final JsonArray entries = json.getArray(COLLECTION);
-        JsonObject lastTopComment = null;
-        for (int i = 0; i < entries.size(); i++) {
-            final JsonObject entry = entries.getObject(i);
-            if (i == 0
-                    || (!SoundcloudParsingHelper.isReplyTo(entries.getObject(i - 1), entry)
-                    && !SoundcloudParsingHelper.isReplyTo(lastTopComment, entry))) {
-                lastTopComment = entry;
-                collector.commit(new SoundcloudCommentsInfoItemExtractor(
-                        json, i, entry, url));
+        /**
+         * The current top level comment.
+         */
+        JsonObject currentTopLevelComment = null;
+        boolean isLastCommentReply = true;
+        // Check whether the first comment in the list is a reply to the last top level comment
+        // from the previous page if there was a previous page.
+        if (lastTopLevelComment != null) {
+            final JsonObject firstComment = entries.getObject(0);
+            if (SoundcloudParsingHelper.isReplyTo(lastTopLevelComment, firstComment)) {
+                currentTopLevelComment = lastTopLevelComment;
+            } else {
+                extractors.add(new SoundcloudCommentsInfoItemExtractor(
+                        json, SoundcloudCommentsInfoItemExtractor.PREVIOUS_PAGE_INDEX,
+                        firstComment, url, null));
            }
        }
+
+        for (int i = 0; i < entries.size(); i++) {
+            final JsonObject entry = entries.getObject(i);
+            // extract all top level comments
+            // The first comment is a top level comment
+            // if it is not a reply to the last top level comment
+            // from the previous page.
+            if (i == 0 && currentTopLevelComment == null
+                    || (!SoundcloudParsingHelper.isReplyTo(entries.getObject(i - 1), entry)
+                    && !SoundcloudParsingHelper.isReplyTo(currentTopLevelComment, entry))) {
+                currentTopLevelComment = entry;
+                if (i == entries.size() - 1) {
+                    isLastCommentReply = false;
+                    this.lastTopLevelComment = currentTopLevelComment;
+                    // Do not collect the last comment if it is a top level comment
+                    // because it might have replies.
+                    // That is information we cannot get from the comment itself
+                    // (thanks SoundCloud...) but needs to be obtained from the next comment.
+                    // The comment will therefore be collected
+                    // when collecting the items from the next page.
+                    break;
+                }
+                extractors.add(new SoundcloudCommentsInfoItemExtractor(
+                        json, i, entry, url, lastTopLevelComment));
+            }
+        }
+        if (isLastCommentReply) {
+            // Do not collect the last top level comment if it has replies and the retrieved
+            // comment list ends with a reply. We do not know whether the next page starts
+            // with more replies to the last top level comment.
+            this.lastTopLevelComment = extractors.remove(extractors.size() - 1).item;
+        }
+        extractors.stream().forEach(collector::commit);
+
    }

-    private boolean collectRepliesFrom(final CommentsInfoItemsCollector collector,
-                                    final JsonObject json,
-                                    final int id,
-                                    final String url) {
+    /**
+     * Collect replies to a top level comment from a SoundCloud API response.
+     * @param collector the collector which collects the replies
+     * @param json the SoundCloud API response
+     * @param id the comment's id for which the replies are collected
+     * @param url the corresponding page's URL
+     * @return
+     */
+    private boolean collectRepliesFrom(@Nonnull final CommentsInfoItemsCollector collector,
+                                       @Nonnull final JsonObject json,
+                                       final int id,
+                                       @Nonnull final String url) {
        JsonObject originalComment = null;
        final JsonArray entries = json.getArray(COLLECTION);
        boolean moreReplies = false;
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java
@ -6,10 +6,8 @@ import com.grack.nanojson.JsonArray;
 import com.grack.nanojson.JsonObject;

 import org.schabi.newpipe.extractor.Page;
-import org.schabi.newpipe.extractor.ServiceList;
 import org.schabi.newpipe.extractor.comments.CommentsInfoItem;
 import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor;
-import org.schabi.newpipe.extractor.comments.CommentsInfoItemsCollector;
 import org.schabi.newpipe.extractor.exceptions.ParsingException;
 import org.schabi.newpipe.extractor.localization.DateWrapper;
 import org.schabi.newpipe.extractor.services.soundcloud.SoundcloudParsingHelper;
@ -17,32 +15,42 @@ import org.schabi.newpipe.extractor.stream.Description;

 import java.util.Objects;

+import javax.annotation.Nonnull;
 import javax.annotation.Nullable;

 public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtractor {
+    public static final int PREVIOUS_PAGE_INDEX = -1;
    public static final String BODY = "body";
    public static final String USER_PERMALINK = "permalink";
    public static final String USER_FULL_NAME = "full_name";
    public static final String USER_USERNAME = "username";

-    private final JsonObject json;
+    @Nonnull private final JsonObject json;
    private final int index;
-    private final JsonObject item;
+    @Nonnull public final JsonObject item;
    private final String url;
-    private final JsonObject user;
-    private final JsonObject superComment;
+    @Nonnull private final JsonObject user;
+    /**
+     * A comment to which this comment is a reply.
+     * Is {@code null} if this comment is itself a top level comment.
+     */
+    @Nullable private final JsonObject topLevelComment;

+    /**
+     * The reply count is not given by the SoundCloud API, but needs to be obtained
+     * by counting the comments which come directly after this item and have the same timestamp.
+     */
    private int replyCount = CommentsInfoItem.UNKNOWN_REPLY_COUNT;
    private Page repliesPage = null;

-    public SoundcloudCommentsInfoItemExtractor(final JsonObject json, final int index,
-                                               final JsonObject item, final String url,
-                                               @Nullable final JsonObject superComment) {
+    public SoundcloudCommentsInfoItemExtractor(@Nonnull final JsonObject json, final int index,
+                                               @Nonnull final JsonObject item, final String url,
+                                               @Nullable final JsonObject topLevelComment) {
        this.json = json;
        this.index = index;
        this.item = item;
        this.url = url;
-        this.superComment = superComment;
+        this.topLevelComment = topLevelComment;
        this.user = item.getObject("user");
    }

@ -58,7 +66,7 @@ public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtr
    @Override
    public Description getCommentText() {
        String commentContent = item.getString(BODY);
-        if (superComment == null) {
+        if (topLevelComment == null) {
            return new Description(commentContent, Description.PLAIN_TEXT);
        }
        // This comment is a reply to another comment.
@ -78,7 +86,7 @@ public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtr
                }
            }
            if (author == null) {
-                author = superComment.getObject("user");
+                author = topLevelComment.getObject("user");
            }
            final String name = isNullOrEmpty(author.getString(USER_FULL_NAME))
                    ? author.getString(USER_USERNAME) : author.getString(USER_FULL_NAME);
@ -149,24 +157,17 @@ public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtr
    @Override
    public Page getReplies() {
        if (replyCount == CommentsInfoItem.UNKNOWN_REPLY_COUNT) {
-            final JsonArray replies = new JsonArray();
-            final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(
-                    ServiceList.SoundCloud.getServiceId());
+            replyCount = 0;
            // SoundCloud has only comments and top level replies, but not nested replies.
            // Therefore, replies cannot have further replies.
-            if (superComment == null) {
+            if (topLevelComment == null) {
                // Loop through all comments which come after the original comment
                // to find its replies.
                final JsonArray allItems = json.getArray(SoundcloudCommentsExtractor.COLLECTION);
-                boolean foundReply = false;
                for (int i = index + 1; i < allItems.size(); i++) {
-                    final JsonObject comment = allItems.getObject(i);
-                    if (SoundcloudParsingHelper.isReplyTo(item, comment)) {
-                        replies.add(comment);
-                        collector.commit(new SoundcloudCommentsInfoItemExtractor(
-                                json, i, comment, url, item));
-                        foundReply = true;
-                    } else if (foundReply) {
+                    if (SoundcloudParsingHelper.isReplyTo(item, allItems.getObject(i))) {
+                        replyCount++;
+                    } else {
                        // Only the comments directly after the original comment
                        // having the same timestamp are replies to the original comment.
                        // The first comment not having the same timestamp
@ -175,8 +176,7 @@ public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtr
                    }
                }
            }
-            replyCount = replies.size();
-            if (collector.getItems().isEmpty()) {
+            if (replyCount == 0) {
                return null;
            }
            repliesPage = new Page(getUrl(), getCommentId());