From a34543913389955f1045f936d3c76933ee11b970 Mon Sep 17 00:00:00 2001 From: Android PowerUser <88908510+Android-PowerUser@users.noreply.github.com> Date: Mon, 8 Jun 2026 22:33:56 +0200 Subject: [PATCH] Use completed command for automatic screenshot flow --- .../ScreenOperatorAccessibilityService.kt | 6 +++- .../PhotoReasoningCommandProcessing.kt | 2 ++ .../multimodal/PhotoReasoningScreen.kt | 1 + .../multimodal/PhotoReasoningViewModel.kt | 33 +++++++++++++------ .../com/google/ai/sample/util/Command.kt | 1 + .../google/ai/sample/util/CommandParser.kt | 11 +++++-- .../sample/util/SystemMessagePreferences.kt | 2 +- .../ai/sample/util/CommandParserTest.kt | 19 +++++++++++ 8 files changed, 60 insertions(+), 15 deletions(-) diff --git a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt index 7177d414..0c3af802 100644 --- a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt +++ b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt @@ -243,6 +243,10 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { true // Asynchronous } is Command.TakeScreenshot -> executeTakeScreenshotCommand() + is Command.Completed -> { + Log.d(TAG, "Command.Completed: No accessibility action required.") + true + } is Command.Wait -> { pendingScreenshotDelayMillis = command.seconds .coerceAtLeast(0L) @@ -420,7 +424,7 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { } } }.also { - if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand) { + if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand && command !is Command.Completed) { sawNonTermuxCommandSinceLastScreenshot = true } } diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt index 1ae5fd20..9f3685b8 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt @@ -6,6 +6,7 @@ import com.google.ai.sample.util.CommandParser internal data class ParsedCommandBatch( val commands: List, val hasTakeScreenshotCommand: Boolean, + val hasCompletedCommand: Boolean, val commandDescriptions: String ) @@ -19,6 +20,7 @@ internal object PhotoReasoningCommandProcessing { return ParsedCommandBatch( commands = commands, hasTakeScreenshotCommand = commands.any { it is Command.TakeScreenshot }, + hasCompletedCommand = commands.any { it is Command.Completed }, commandDescriptions = commands.joinToString("; ") { it.toString() } ) } diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt index 92864c01..211e4fa8 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt @@ -392,6 +392,7 @@ fun PhotoReasoningScreen( is Command.ClickButton -> "Click on button: \"${command.buttonText}\"" is Command.TapCoordinates -> "Tap coordinates: (${command.x}, ${command.y})" is Command.TakeScreenshot -> "Take screenshot" + is Command.Completed -> "Completed" is Command.Wait -> "Wait: ${command.seconds} seconds" is Command.Retrieve -> "Retrieve: \"${command.heading}\"" else -> command::class.simpleName ?: "Unknown Command" diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt index c3b4b053..c0be47d5 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt @@ -2235,12 +2235,17 @@ class PhotoReasoningViewModel( for (command in newCommands) { if (stopExecutionFlag.get()) break - // Skip takeScreenshot during streaming - it will be handled by final processCommands + // Skip commands that are handled only after streaming has finished. if (command is Command.TakeScreenshot) { Log.d(TAG, "Incremental: Skipping takeScreenshot during streaming (will be handled at end)") incrementalCommandCount++ continue } + if (command is Command.Completed) { + Log.d(TAG, "Incremental: completed() received; stopping incremental command execution") + incrementalCommandCount++ + break + } try { Log.d(TAG, "Incremental: Executing command: $command") @@ -2267,17 +2272,29 @@ private fun processCommands(text: String) { if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) return@launch // Check for cancellation try { val commandBatch = PhotoReasoningCommandProcessing.parseForFinalExecution(text) - val commands = commandBatch.commands + val parsedCommands = commandBatch.commands + val hasCompletedCommand = commandBatch.hasCompletedCommand val hasTakeScreenshotCommand = commandBatch.hasTakeScreenshotCommand + val commandsBeforeCompletion = if (hasCompletedCommand) { + parsedCommands.takeWhile { it !is Command.Completed } + Command.Completed + } else { + parsedCommands + } + val commands = if (hasCompletedCommand || hasTakeScreenshotCommand) { + commandsBeforeCompletion + } else { + commandsBeforeCompletion + Command.TakeScreenshot + } val commandsToExecute = commands.mapIndexedNotNull { index, command -> when { + command is Command.Completed -> null command is Command.Retrieve -> null index < incrementalCommandCount && command !is Command.TakeScreenshot -> null else -> command } } - if (hasTakeScreenshotCommand) { + if (!hasCompletedCommand) { pendingRetrievedInfoForNextScreenshot = buildRetrievedInfoForNextScreenshot(commands) } @@ -2293,7 +2310,7 @@ private fun processCommands(text: String) { commands = commands ) _commandExecutionStatus.value = PhotoReasoningCommandStateUpdater.buildDetectedStatus( - commandBatch.commandDescriptions + commands.joinToString("; ") { it.toString() } ) } @@ -2324,12 +2341,8 @@ private fun processCommands(text: String) { } } - // Toast anzeigen wenn kein takeScreenshot Command gefunden wurde - if (!hasTakeScreenshotCommand && !text.contains("takeScreenshot()", ignoreCase = true)) { - val context = MainActivity.getInstance() - if (context != null) { - PhotoReasoningCommandUiNotifier.showStoppedByAi(context) - } + if (hasCompletedCommand) { + _commandExecutionStatus.value = "Task marked completed by AI." } } catch (e: Exception) { diff --git a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt index f82628f4..fdf27171 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt @@ -8,6 +8,7 @@ sealed class Command { data class LongClickButton(val buttonText: String) : Command() data class TapCoordinates(val x: String, val y: String) : Command() object TakeScreenshot : Command() + object Completed : Command() data class Wait(val seconds: Long) : Command() object PressHomeButton : Command() object PressBackButton : Command() diff --git a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt index 62c920cf..79cc0b88 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt @@ -7,11 +7,14 @@ import android.util.Log */ object CommandParser { private const val TAG = "CommandParser" - private val SINGLE_INSTANCE_COMMAND_TYPES = setOf(CommandTypeEnum.TAKE_SCREENSHOT) + private val SINGLE_INSTANCE_COMMAND_TYPES = setOf( + CommandTypeEnum.TAKE_SCREENSHOT, + CommandTypeEnum.COMPLETED + ) // Enum to represent different command types private enum class CommandTypeEnum { - CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, WAIT, PRESS_HOME, PRESS_BACK, + CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, COMPLETED, WAIT, PRESS_HOME, PRESS_BACK, SHOW_RECENT_APPS, SCROLL_DOWN, SCROLL_UP, SCROLL_LEFT, SCROLL_RIGHT, SCROLL_DOWN_FROM_COORDINATES, SCROLL_UP_FROM_COORDINATES, SCROLL_LEFT_FROM_COORDINATES, SCROLL_RIGHT_FROM_COORDINATES, @@ -53,8 +56,9 @@ object CommandParser { // Tap coordinates patterns PatternInfo("tapCoords1", Regex("(?i)\\btapAtCoordinates\\(\\s*([\\d\\.%]+)\\s*,\\s*([\\d\\.%]+)\\s*\\)"), { match -> Command.TapCoordinates(match.groupValues[1], match.groupValues[2]) }, CommandTypeEnum.TAP_COORDINATES), - // Screenshot and wait patterns + // Screenshot, completion and wait patterns PatternInfo("screenshot1", Regex("(?i)\\btakeScreenshot\\(\\)"), { Command.TakeScreenshot }, CommandTypeEnum.TAKE_SCREENSHOT), + PatternInfo("completed1", Regex("(?i)\\bcompleted\\(\\)"), { Command.Completed }, CommandTypeEnum.COMPLETED), PatternInfo("wait1", Regex("(?i)\\bWait\\(\\s*(\\d+)\\s*\\)"), { match -> Command.Wait(match.groupValues[1].toLong()) }, CommandTypeEnum.WAIT), // Home button patterns @@ -151,6 +155,7 @@ object CommandParser { is Command.LongClickButton -> Log.d(TAG, "Command details: LongClickButton(\"${command.buttonText}\")") is Command.TapCoordinates -> Log.d(TAG, "Command details: TapCoordinates(${command.x}, ${command.y})") is Command.TakeScreenshot -> Log.d(TAG, "Command details: TakeScreenshot") + is Command.Completed -> Log.d(TAG, "Command details: Completed") is Command.Wait -> Log.d(TAG, "Command details: Wait(${command.seconds})") is Command.PressHomeButton -> Log.d(TAG, "Command details: PressHomeButton") is Command.PressBackButton -> Log.d(TAG, "Command details: PressBackButton") diff --git a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt index 4ad868c6..f3e2df95 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt @@ -14,7 +14,7 @@ object SystemMessagePreferences { private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag // Content from pasted_content.txt - private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and do not use takeScreenshot() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()"""" + private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "completed()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and use completed() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. After each message, you will see the screen with additional information about it. Say "completed()" when the task is finished.""" private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) /** diff --git a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt index 361b894f..4b263996 100644 --- a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt +++ b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt @@ -78,4 +78,23 @@ class CommandParserTest { assertEquals(7L, (wait as Command.Wait).seconds) assertTrue(commands[1] is Command.TakeScreenshot) } + + @Test + fun parseCommands_extractsCompletedCommand() { + val commands = CommandParser.parseCommands("completed()", clearBuffer = true) + + assertEquals(1, commands.size) + assertTrue(commands.first() is Command.Completed) + } + + @Test + fun parseCommands_keepsSingleCompletedCommandInstance() { + val commands = CommandParser.parseCommands( + "completed() completed()", + clearBuffer = true + ) + + assertEquals(1, commands.count { it is Command.Completed }) + } + }