Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ class ScreenOperatorAccessibilityService : AccessibilityService() {
true // Asynchronous
}
is Command.TakeScreenshot -> executeTakeScreenshotCommand()
is Command.Completed -> {
Log.d(TAG, "Command.Completed: No accessibility action required.")
true
}
is Command.Wait -> {
pendingScreenshotDelayMillis = command.seconds
.coerceAtLeast(0L)
Expand Down Expand Up @@ -420,7 +424,7 @@ class ScreenOperatorAccessibilityService : AccessibilityService() {
}
}
}.also {
if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand) {
if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand && command !is Command.Completed) {
sawNonTermuxCommandSinceLastScreenshot = true
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import com.google.ai.sample.util.CommandParser
/**
 * Bundle of commands parsed from a piece of model output, together with
 * summary values derived from that command list.
 *
 * The descriptions below reflect how the visible factory in
 * `PhotoReasoningCommandProcessing` populates the fields; the class itself
 * does not enforce that the flags agree with [commands].
 *
 * @property commands the parsed commands.
 * @property hasTakeScreenshotCommand set to `commands.any { it is Command.TakeScreenshot }`.
 * @property hasCompletedCommand set to `commands.any { it is Command.Completed }`.
 * @property commandDescriptions each command's `toString()` joined with `"; "`.
 */
internal data class ParsedCommandBatch(
val commands: List<Command>,
val hasTakeScreenshotCommand: Boolean,
val hasCompletedCommand: Boolean,
val commandDescriptions: String
)

Expand All @@ -19,6 +20,7 @@ internal object PhotoReasoningCommandProcessing {
return ParsedCommandBatch(
commands = commands,
hasTakeScreenshotCommand = commands.any { it is Command.TakeScreenshot },
hasCompletedCommand = commands.any { it is Command.Completed },
commandDescriptions = commands.joinToString("; ") { it.toString() }
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ fun PhotoReasoningScreen(
is Command.ClickButton -> "Click on button: \"${command.buttonText}\""
is Command.TapCoordinates -> "Tap coordinates: (${command.x}, ${command.y})"
is Command.TakeScreenshot -> "Take screenshot"
is Command.Completed -> "Completed"
is Command.Wait -> "Wait: ${command.seconds} seconds"
is Command.Retrieve -> "Retrieve: \"${command.heading}\""
else -> command::class.simpleName ?: "Unknown Command"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2235,12 +2235,17 @@ class PhotoReasoningViewModel(
for (command in newCommands) {
if (stopExecutionFlag.get()) break

// Skip takeScreenshot during streaming - it will be handled by final processCommands
// Skip commands that are handled only after streaming has finished.
if (command is Command.TakeScreenshot) {
Log.d(TAG, "Incremental: Skipping takeScreenshot during streaming (will be handled at end)")
incrementalCommandCount++
continue
}
if (command is Command.Completed) {
Log.d(TAG, "Incremental: completed() received; stopping incremental command execution")
incrementalCommandCount++
break
}

try {
Log.d(TAG, "Incremental: Executing command: $command")
Expand All @@ -2267,17 +2272,29 @@ private fun processCommands(text: String) {
if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) return@launch // Check for cancellation
try {
val commandBatch = PhotoReasoningCommandProcessing.parseForFinalExecution(text)
val commands = commandBatch.commands
val parsedCommands = commandBatch.commands
val hasCompletedCommand = commandBatch.hasCompletedCommand
val hasTakeScreenshotCommand = commandBatch.hasTakeScreenshotCommand
val commandsBeforeCompletion = if (hasCompletedCommand) {
parsedCommands.takeWhile { it !is Command.Completed } + Command.Completed
} else {
parsedCommands
}
val commands = if (hasCompletedCommand || hasTakeScreenshotCommand) {
commandsBeforeCompletion
} else {
commandsBeforeCompletion + Command.TakeScreenshot
}
val commandsToExecute = commands.mapIndexedNotNull { index, command ->
when {
command is Command.Completed -> null
command is Command.Retrieve -> null
index < incrementalCommandCount && command !is Command.TakeScreenshot -> null
else -> command
}
}

if (hasTakeScreenshotCommand) {
if (!hasCompletedCommand) {
pendingRetrievedInfoForNextScreenshot = buildRetrievedInfoForNextScreenshot(commands)
}

Expand All @@ -2293,7 +2310,7 @@ private fun processCommands(text: String) {
commands = commands
)
_commandExecutionStatus.value = PhotoReasoningCommandStateUpdater.buildDetectedStatus(
commandBatch.commandDescriptions
commands.joinToString("; ") { it.toString() }
)
}

Expand Down Expand Up @@ -2324,12 +2341,8 @@ private fun processCommands(text: String) {
}
}

// Show a toast if no takeScreenshot command was found
if (!hasTakeScreenshotCommand && !text.contains("takeScreenshot()", ignoreCase = true)) {
val context = MainActivity.getInstance()
if (context != null) {
PhotoReasoningCommandUiNotifier.showStoppedByAi(context)
}
if (hasCompletedCommand) {
_commandExecutionStatus.value = "Task marked completed by AI."
}

} catch (e: Exception) {
Expand Down
1 change: 1 addition & 0 deletions app/src/main/kotlin/com/google/ai/sample/util/Command.kt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ sealed class Command {
data class LongClickButton(val buttonText: String) : Command()
data class TapCoordinates(val x: String, val y: String) : Command()
object TakeScreenshot : Command()
object Completed : Command()
data class Wait(val seconds: Long) : Command()
object PressHomeButton : Command()
object PressBackButton : Command()
Expand Down
11 changes: 8 additions & 3 deletions app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@ import android.util.Log
*/
object CommandParser {
private const val TAG = "CommandParser"
private val SINGLE_INSTANCE_COMMAND_TYPES = setOf(CommandTypeEnum.TAKE_SCREENSHOT)
// Command types flagged as "single instance": take-screenshot and completed.
// NOTE(review): the code that reads this set is not visible here; presumably
// repeated occurrences of these types are collapsed to one per parse — confirm
// against the parsing logic.
private val SINGLE_INSTANCE_COMMAND_TYPES = setOf(
CommandTypeEnum.TAKE_SCREENSHOT,
CommandTypeEnum.COMPLETED
)

// Enum to represent different command types
private enum class CommandTypeEnum {
CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, WAIT, PRESS_HOME, PRESS_BACK,
CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, COMPLETED, WAIT, PRESS_HOME, PRESS_BACK,
SHOW_RECENT_APPS, SCROLL_DOWN, SCROLL_UP, SCROLL_LEFT, SCROLL_RIGHT,
SCROLL_DOWN_FROM_COORDINATES, SCROLL_UP_FROM_COORDINATES,
SCROLL_LEFT_FROM_COORDINATES, SCROLL_RIGHT_FROM_COORDINATES,
Expand Down Expand Up @@ -53,8 +56,9 @@ object CommandParser {
// Tap coordinates patterns
PatternInfo("tapCoords1", Regex("(?i)\\btapAtCoordinates\\(\\s*([\\d\\.%]+)\\s*,\\s*([\\d\\.%]+)\\s*\\)"), { match -> Command.TapCoordinates(match.groupValues[1], match.groupValues[2]) }, CommandTypeEnum.TAP_COORDINATES),

// Screenshot and wait patterns
// Screenshot, completion and wait patterns
PatternInfo("screenshot1", Regex("(?i)\\btakeScreenshot\\(\\)"), { Command.TakeScreenshot }, CommandTypeEnum.TAKE_SCREENSHOT),
PatternInfo("completed1", Regex("(?i)\\bcompleted\\(\\)"), { Command.Completed }, CommandTypeEnum.COMPLETED),
PatternInfo("wait1", Regex("(?i)\\bWait\\(\\s*(\\d+)\\s*\\)"), { match -> Command.Wait(match.groupValues[1].toLong()) }, CommandTypeEnum.WAIT),

// Home button patterns
Expand Down Expand Up @@ -151,6 +155,7 @@ object CommandParser {
is Command.LongClickButton -> Log.d(TAG, "Command details: LongClickButton(\"${command.buttonText}\")")
is Command.TapCoordinates -> Log.d(TAG, "Command details: TapCoordinates(${command.x}, ${command.y})")
is Command.TakeScreenshot -> Log.d(TAG, "Command details: TakeScreenshot")
is Command.Completed -> Log.d(TAG, "Command details: Completed")
is Command.Wait -> Log.d(TAG, "Command details: Wait(${command.seconds})")
is Command.PressHomeButton -> Log.d(TAG, "Command details: PressHomeButton")
is Command.PressBackButton -> Log.d(TAG, "Command details: PressBackButton")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ object SystemMessagePreferences {
private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag

// Content from pasted_content.txt
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and do not use takeScreenshot() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. 
You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()""""
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "completed()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and use completed() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. After each message, you will see the screen with additional information about it. 
Say "completed()" when the task is finished."""
private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE)

/**
Expand Down
19 changes: 19 additions & 0 deletions app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,23 @@ class CommandParserTest {
assertEquals(7L, (wait as Command.Wait).seconds)
assertTrue(commands[1] is Command.TakeScreenshot)
}

/** A lone `completed()` call must parse to exactly one [Command.Completed]. */
@Test
fun parseCommands_extractsCompletedCommand() {
    val parsed = CommandParser.parseCommands("completed()", clearBuffer = true)

    // Size is asserted first so the index access below cannot go out of bounds.
    assertEquals(1, parsed.size)
    assertTrue(parsed[0] is Command.Completed)
}

/** Two `completed()` calls in one input must yield only one [Command.Completed]. */
@Test
fun parseCommands_keepsSingleCompletedCommandInstance() {
    val input = "completed() completed()"

    val completedCommands = CommandParser
        .parseCommands(input, clearBuffer = true)
        .filter { it is Command.Completed }

    assertEquals(1, completedCommands.size)
}

}