From 42ae37229d0aa3a0fd6e07e1bbe3003938af260a Mon Sep 17 00:00:00 2001 From: Matthew Ball Date: Wed, 1 Jul 2026 10:45:18 -0700 Subject: [PATCH] refactor(sklearn): moved base class to shared file between sklearnclassifieropdesc and sklearntrainingopdesc --- .../sklearn/SklearnClassifierOpDesc.scala | 74 +------------- .../operator/sklearn/SklearnModelOpDesc.scala | 97 +++++++++++++++++++ .../training/SklearnTrainingOpDesc.scala | 75 +------------- .../sklearn/SklearnOpDescRegistrySpec.scala | 2 +- 4 files changed, 105 insertions(+), 143 deletions(-) create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala index 0c8a103c52b..92aec692a6e 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnClassifierOpDesc.scala @@ -19,73 +19,15 @@ package org.apache.texera.amber.operator.sklearn -import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} -import com.kjetland.jackson.jsonSchema.annotations.{ - JsonSchemaInject, - JsonSchemaInt, - JsonSchemaString, - JsonSchemaTitle -} -import org.apache.texera.amber.core.tuple.{AttributeType, Schema} import org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext -import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity} -import org.apache.texera.amber.operator.PythonOperatorDescriptor -import org.apache.texera.amber.operator.metadata.annotations.{ - AutofillAttributeName, - CommonOpDescAnnotation, - HideAnnotation -} import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor { - - @JsonSchemaTitle("Target Attribute") - @JsonPropertyDescription("Attribute in your dataset corresponding to target.") - @JsonProperty(required = true) - @AutofillAttributeName - var target: EncodableString = _ +abstract class SklearnClassifierOpDesc extends SklearnModelOpDesc { - @JsonSchemaTitle("Count Vectorizer") - @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.") - @JsonProperty(defaultValue = "false") - var countVectorizer: Boolean = false + override def getImportStatements = "" - @JsonSchemaTitle("Text Attribute") - @JsonPropertyDescription("Attribute in your dataset with text to vectorize.") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString( - path = CommonOpDescAnnotation.autofill, - value = CommonOpDescAnnotation.attributeName - ), - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ), - ints = Array( - new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0) - ) - ) - var text: EncodableString = _ - - @JsonSchemaTitle("Tfidf Transformer") - @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.") - @JsonProperty(defaultValue = "false") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ) - ) - val tfidfTransformer: Boolean = false - - @JsonIgnore - def getImportStatements = "" - - @JsonIgnore - def getUserFriendlyModelName = "" + override def getUserFriendlyModelName = "" override def generatePythonCode(): String = pyb"""$getImportStatements @@ -126,14 +68,4 @@ abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor { ), outputPorts = List(OutputPort(blocking = true)) ) - - override def getOutputSchemas( - inputSchemas: Map[PortIdentity, Schema] - ): Map[PortIdentity, Schema] = { - Map( - operatorInfo.outputPorts.head.id -> Schema() - .add("model_name", AttributeType.STRING) - .add("model", AttributeType.BINARY) - ) - } } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala new file mode 100644 index 00000000000..6665477aeac --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/SklearnModelOpDesc.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.sklearn + +import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.{ + JsonSchemaInject, + JsonSchemaInt, + JsonSchemaString, + JsonSchemaTitle +} +import org.apache.texera.amber.core.tuple.{AttributeType, Schema} +import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.operator.PythonOperatorDescriptor +import org.apache.texera.amber.operator.metadata.annotations.{ + AutofillAttributeName, + CommonOpDescAnnotation, + HideAnnotation +} + +abstract class SklearnModelOpDesc extends PythonOperatorDescriptor { + + @JsonSchemaTitle("Target Attribute") + @JsonPropertyDescription("Attribute in your dataset corresponding to target.") + @JsonProperty(required = true) + @AutofillAttributeName + var target: EncodableString = _ + + @JsonSchemaTitle("Count Vectorizer") + @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.") + @JsonProperty(defaultValue = "false") + var countVectorizer: Boolean = false + + @JsonSchemaTitle("Text Attribute") + @JsonPropertyDescription("Attribute in your dataset with text to vectorize.") + @JsonSchemaInject( + strings = Array( + new JsonSchemaString( + path = CommonOpDescAnnotation.autofill, + value = CommonOpDescAnnotation.attributeName + ), + new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), + new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), + new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") + ), + ints = Array( + new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0) + ) + ) + var text: EncodableString = _ + + @JsonSchemaTitle("Tfidf Transformer") + @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.") + @JsonProperty(defaultValue = "false") + @JsonSchemaInject( + strings = Array( + new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), + new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), + new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") + ) + ) + var tfidfTransformer: Boolean = false + + @JsonIgnore + def getImportStatements: String + + @JsonIgnore + def getUserFriendlyModelName: String + + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + Map( + operatorInfo.outputPorts.head.id -> Schema() + .add("model_name", AttributeType.STRING) + .add("model", AttributeType.BINARY) + ) + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala index a00cc7ec9c8..d7daada8d92 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/sklearn/training/SklearnTrainingOpDesc.scala @@ -19,73 +19,16 @@ package org.apache.texera.amber.operator.sklearn.training -import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} -import com.kjetland.jackson.jsonSchema.annotations.{ - JsonSchemaInject, - JsonSchemaInt, - JsonSchemaString, - JsonSchemaTitle -} -import org.apache.texera.amber.core.tuple.{AttributeType, Schema} import org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext -import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity} -import org.apache.texera.amber.operator.PythonOperatorDescriptor -import org.apache.texera.amber.operator.metadata.annotations.{ - AutofillAttributeName, - CommonOpDescAnnotation, - HideAnnotation -} import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import org.apache.texera.amber.operator.sklearn.SklearnModelOpDesc -class SklearnTrainingOpDesc extends PythonOperatorDescriptor { - - @JsonSchemaTitle("Target Attribute") - @JsonPropertyDescription("Attribute in your dataset corresponding to target.") - @JsonProperty(required = true) - @AutofillAttributeName - var target: EncodableString = _ - - @JsonSchemaTitle("Count Vectorizer") - @JsonPropertyDescription("Convert a collection of text documents to a matrix of token counts.") - @JsonProperty(defaultValue = "false") - var countVectorizer: Boolean = false - - @JsonSchemaTitle("Text Attribute") - @JsonPropertyDescription("Attribute in your dataset with text to vectorize.") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString( - path = CommonOpDescAnnotation.autofill, - value = CommonOpDescAnnotation.attributeName - ), - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ), - ints = Array( - new JsonSchemaInt(path = CommonOpDescAnnotation.autofillAttributeOnPort, value = 0) - ) - ) - var text: EncodableString = _ - - @JsonSchemaTitle("Tfidf Transformer") - @JsonPropertyDescription("Transform a count matrix to a normalized tf or tf-idf representation.") - @JsonProperty(defaultValue = "false") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "countVectorizer"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ) - ) - var tfidfTransformer: Boolean = false +class SklearnTrainingOpDesc extends SklearnModelOpDesc { - @JsonIgnore - def getImportStatements = "from sklearn.ensemble import RandomForestClassifier" + override def getImportStatements = "from sklearn.ensemble import RandomForestClassifier" - @JsonIgnore - def getUserFriendlyModelName = "RandomForest Training" + override def getUserFriendlyModelName = "RandomForest Training" override def generatePythonCode(): String = pyb"""$getImportStatements @@ -115,14 +58,4 @@ class SklearnTrainingOpDesc extends PythonOperatorDescriptor { inputPorts = List(InputPort(PortIdentity(), "training")), outputPorts = List(OutputPort(blocking = true)) ) - - override def getOutputSchemas( - inputSchemas: Map[PortIdentity, Schema] - ): Map[PortIdentity, Schema] = { - Map( - operatorInfo.outputPorts.head.id -> Schema() - .add("model_name", AttributeType.STRING) - .add("model", AttributeType.BINARY) - ) - } } diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala index 29e80162d6b..d0598b92319 100644 --- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala @@ -166,7 +166,7 @@ class SklearnOpDescRegistrySpec extends AnyFlatSpec { val desc = new SklearnLogisticRegressionOpDesc() desc.target = "y" desc.countVectorizer = false - // `tfidfTransformer` is a val on the base class, defaults to false. + // `tfidfTransformer` is defined on the shared base class, defaults to false. val code = desc.generatePythonCode() assert(code.contains("from sklearn.linear_model import LogisticRegression")) // Classifier OpDescs emit a UDFTableOperator pipeline.