From 42ae37229d0aa3a0fd6e07e1bbe3003938af260a Mon Sep 17 00:00:00 2001
From: Matthew Ball <mgball@uci.edu>
Date: Wed, 1 Jul 2026 10:45:18 -0700
Subject: [PATCH] refactor(sklearn): extract shared SklearnModelOpDesc base
 class for SklearnClassifierOpDesc and SklearnTrainingOpDesc

---
 .../sklearn/SklearnClassifierOpDesc.scala     | 74 +-------------
 .../operator/sklearn/SklearnModelOpDesc.scala | 97 +++++++++++++++++++
 .../training/SklearnTrainingOpDesc.scala      | 75 +-------------
 .../sklearn/SklearnOpDescRegistrySpec.scala   |  2 +-
 4 files changed, 105 insertions(+), 143 deletions(-)
 create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala

diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala
index 0c8a103c52b..92aec692a6e 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala
@@ -19,73 +19,15 @@
 
 package org.apache.texera.amber.operator.sklearn
 
-import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription}
-import com.kjetland.jackson.jsonSchema.annotations.{
-  JsonSchemaInject,
-  JsonSchemaInt,
-  JsonSchemaString,
-  JsonSchemaTitle
-}
-import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
 import org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext
-import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
 import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
-import org.apache.texera.amber.operator.PythonOperatorDescriptor
-import org.apache.texera.amber.operator.metadata.annotations.{
-  AutofillAttributeName,
-  CommonOpDescAnnotation,
-  HideAnnotation
-}
 import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 
-abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor {
-
-  @JsonSchemaTitle("Target Attribute")
-  @JsonPropertyDescription("Attribute in your dataset corresponding to target.")
-  @JsonProperty(required = true)
-  @AutofillAttributeName
-  var target: EncodableString = _
+abstract class SklearnClassifierOpDesc extends SklearnModelOpDesc {
 
-  @JsonSchemaTitle("Count Vectorizer")
-  @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.")
-  @JsonProperty(defaultValue = "false")
-  var countVectorizer: Boolean = false
+  override def getImportStatements = ""
 
-  @JsonSchemaTitle("Text Attribute")
-  @JsonPropertyDescription("Attribute in your dataset with text to vectorize.")
-  @JsonSchemaInject(
-    strings = Array(
-      new JsonSchemaString(
-        path = CommonOpDescAnnotation.autofill,
-        value = CommonOpDescAnnotation.attributeName
-      ),
-      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
-      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
-      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
-    ),
-    ints = Array(
-      new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0)
-    )
-  )
-  var text: EncodableString = _
-
-  @JsonSchemaTitle("Tfidf Transformer")
-  @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.")
-  @JsonProperty(defaultValue = "false")
-  @JsonSchemaInject(
-    strings = Array(
-      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
-      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
-      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
-    )
-  )
-  val tfidfTransformer: Boolean = false
-
-  @JsonIgnore
-  def getImportStatements = ""
-
-  @JsonIgnore
-  def getUserFriendlyModelName = ""
+  override def getUserFriendlyModelName = ""
 
   override def generatePythonCode(): String =
     pyb"""$getImportStatements
@@ -126,14 +68,4 @@ abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor {
       ),
       outputPorts = List(OutputPort(blocking = true))
     )
-
-  override def getOutputSchemas(
-      inputSchemas: Map[PortIdentity, Schema]
-  ): Map[PortIdentity, Schema] = {
-    Map(
-      operatorInfo.outputPorts.head.id -> Schema()
-        .add("model_name", AttributeType.STRING)
-        .add("model", AttributeType.BINARY)
-    )
-  }
 }
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala
new file mode 100644
index 00000000000..6665477aeac
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.{
+  JsonSchemaInject,
+  JsonSchemaInt,
+  JsonSchemaString,
+  JsonSchemaTitle
+}
+import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
+import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.operator.PythonOperatorDescriptor
+import org.apache.texera.amber.operator.metadata.annotations.{
+  AutofillAttributeName,
+  CommonOpDescAnnotation,
+  HideAnnotation
+}
+
+abstract class SklearnModelOpDesc extends PythonOperatorDescriptor { // shared sklearn base
+  /** Target attribute name; declared with `required = true` for JSON. */
+  @JsonSchemaTitle("Target Attribute")
+  @JsonPropertyDescription("Attribute in your dataset corresponding to target.")
+  @JsonProperty(required = true)
+  @AutofillAttributeName
+  var target: EncodableString = _
+  /** Toggle titled "Count Vectorizer"; defaults to false. */
+  @JsonSchemaTitle("Count Vectorizer")
+  @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.")
+  @JsonProperty(defaultValue = "false")
+  var countVectorizer: Boolean = false
+  /** Text attribute to vectorize; hide rule: countVectorizer equals "false". */
+  @JsonSchemaTitle("Text Attribute")
+  @JsonPropertyDescription("Attribute in your dataset with text to vectorize.")
+  @JsonSchemaInject(
+    strings = Array(
+      new JsonSchemaString(
+        path = CommonOpDescAnnotation.autofill,
+        value = CommonOpDescAnnotation.attributeName
+      ),
+      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
+      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
+      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
+    ),
+    ints = Array(
+      new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0)
+    )
+  )
+  var text: EncodableString = _
+  /** Toggle titled "Tfidf Transformer"; same hide rule as `text`, defaults to false. */
+  @JsonSchemaTitle("Tfidf Transformer")
+  @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.")
+  @JsonProperty(defaultValue = "false")
+  @JsonSchemaInject(
+    strings = Array(
+      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
+      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
+      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
+    )
+  )
+  var tfidfTransformer: Boolean = false // NOTE(review): was `val` in classifier base; confirm
+  /** Python import line(s) supplied by each subclass; marked JsonIgnore. */
+  @JsonIgnore
+  def getImportStatements: String
+  /** Display name of the model, supplied by each subclass; marked JsonIgnore. */
+  @JsonIgnore
+  def getUserFriendlyModelName: String
+  /** Fixed output schema on the first output port: model_name (STRING), model (BINARY). */
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    Map(
+      operatorInfo.outputPorts.head.id -> Schema()
+        .add("model_name", AttributeType.STRING)
+        .add("model", AttributeType.BINARY)
+    )
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala
index a00cc7ec9c8..d7daada8d92 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala
@@ -19,73 +19,16 @@
 
 package org.apache.texera.amber.operator.sklearn.training
 
-import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription}
-import com.kjetland.jackson.jsonSchema.annotations.{
-  JsonSchemaInject,
-  JsonSchemaInt,
-  JsonSchemaString,
-  JsonSchemaTitle
-}
-import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
 import org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext
-import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
 import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
-import org.apache.texera.amber.operator.PythonOperatorDescriptor
-import org.apache.texera.amber.operator.metadata.annotations.{
-  AutofillAttributeName,
-  CommonOpDescAnnotation,
-  HideAnnotation
-}
 import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import org.apache.texera.amber.operator.sklearn.SklearnModelOpDesc
 
-class SklearnTrainingOpDesc extends PythonOperatorDescriptor {
-
-  @JsonSchemaTitle("Target Attribute")
-  @JsonPropertyDescription("Attribute in your dataset corresponding to target.")
-  @JsonProperty(required = true)
-  @AutofillAttributeName
-  var target: EncodableString = _
-
-  @JsonSchemaTitle("Count Vectorizer")
-  @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.")
-  @JsonProperty(defaultValue = "false")
-  var countVectorizer: Boolean = false
-
-  @JsonSchemaTitle("Text Attribute")
-  @JsonPropertyDescription("Attribute in your dataset with text to vectorize.")
-  @JsonSchemaInject(
-    strings = Array(
-      new JsonSchemaString(
-        path = CommonOpDescAnnotation.autofill,
-        value = CommonOpDescAnnotation.attributeName
-      ),
-      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
-      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
-      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
-    ),
-    ints = Array(
-      new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0)
-    )
-  )
-  var text: EncodableString = _
-
-  @JsonSchemaTitle("Tfidf Transformer")
-  @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.")
-  @JsonProperty(defaultValue = "false")
-  @JsonSchemaInject(
-    strings = Array(
-      new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"),
-      new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals),
-      new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false")
-    )
-  )
-  var tfidfTransformer: Boolean = false
+class SklearnTrainingOpDesc extends SklearnModelOpDesc {
 
-  @JsonIgnore
-  def getImportStatements = "from sklearn.ensemble import RandomForestClassifier"
+  override def getImportStatements = "from sklearn.ensemble import RandomForestClassifier"
 
-  @JsonIgnore
-  def getUserFriendlyModelName = "RandomForest Training"
+  override def getUserFriendlyModelName = "RandomForest Training"
 
   override def generatePythonCode(): String =
     pyb"""$getImportStatements
@@ -115,14 +58,4 @@ class SklearnTrainingOpDesc extends PythonOperatorDescriptor {
       inputPorts = List(InputPort(PortIdentity(), "training")),
       outputPorts = List(OutputPort(blocking = true))
     )
-
-  override def getOutputSchemas(
-      inputSchemas: Map[PortIdentity, Schema]
-  ): Map[PortIdentity, Schema] = {
-    Map(
-      operatorInfo.outputPorts.head.id -> Schema()
-        .add("model_name", AttributeType.STRING)
-        .add("model", AttributeType.BINARY)
-    )
-  }
 }
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala
index 29e80162d6b..d0598b92319 100644
--- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala
@@ -166,7 +166,7 @@ class SklearnOpDescRegistrySpec extends AnyFlatSpec {
     val desc = new SklearnLogisticRegressionOpDesc()
     desc.target = "y"
     desc.countVectorizer = false
-    // `tfidfTransformer` is a val on the base class, defaults to false.
+    // `tfidfTransformer` is defined on the shared base class, defaults to false.
     val code = desc.generatePythonCode()
     assert(code.contains("from sklearn.linear_model import LogisticRegression"))
     // Classifier OpDescs emit a UDFTableOperator pipeline.